# Data Science Challenge

## A quick and not very successful attempt to get some information from the ERNI Connect advertising texts

In [None]:
################################## Preconditions for running this iPython notebook ##########################################
### You need to have the Connect advertising text data set saved somewhere. In this tutorial it is renamed 'connect.txt'  ###
### You also need to have nltk package installed using pip or anaconda...                                                 ###
#############################################################################################################################

In [None]:
import nltk

In [None]:
from nltk.corpus import PlaintextCorpusReader

### Getting things ready

#### We have both German and English in the corpus, so let's extract just the lines of text that are marked as English for this exercise.

In [None]:
# Plain Python, no nltk involved: copy every line of the source file that
# contains the marker "English" into a new file and drop all other lines.

### NOTE: change the paths below to wherever these files live on your machine

oldfile = r'C:\Users\lon\Desktop\connect.txt'         # the original Connect advertising texts data set
newfile = r'C:\Users\lon\Desktop\connect_new.txt'     # created (or overwritten) by this cell

with open(oldfile, encoding='utf8') as old:
    with open(newfile, 'w', encoding='utf8') as new:
        # writelines() pulls from the generator lazily, so the input is still
        # streamed one line at a time rather than loaded into memory at once.
        new.writelines(text_line for text_line in old if "English" in text_line)


#### Now we will get the text from this new, English-only file

In [None]:
#### change the paths to your own

# Directory that contains connect_new.txt (the English-only file).
corpus_root = r'C:\Users\lon\Desktop'
# Corpus reader restricted to that single file inside corpus_root.
wordlists = PlaintextCorpusReader(corpus_root, 'connect_new.txt')

In [None]:
# Get the word tokens of the English-only file from the corpus reader.
# NOTE(review): nltk corpus readers typically return a lazy, list-like view
# rather than a plain Python list — confirm before relying on list-only methods.
connect_en = wordlists.words('connect_new.txt')

In [None]:
# Quick look at the tokens that were read from the corpus.
print(connect_en)

#### Get rid of pesky little words like "the", "it", "is", etc. (called "stopwords" in computational linguistics)

In [None]:
# Fetch nltk's 'stopwords' corpus, which the stop-word filter in this notebook reads.
# If you haven't already downloaded all the nltk corpora, this downloads it for you.
nltk.download('stopwords')

In [None]:
def strip_en_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Each token is lower-cased before it is looked up, so the match is
    case-insensitive with respect to the token ("The" is removed when "the"
    is a stop word). Tokens that are kept retain their original case and order.

    Args:
        text: Iterable of string tokens.
        stop_words: Optional iterable of words to remove, compared against the
            lower-cased tokens. When omitted, nltk's English stop-word list is
            loaded (this needs the nltk 'stopwords' corpus to be available).

    Returns:
        A new list containing the tokens that are not stop words.
    """
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests; a list would be scanned
    # from the start for every single token.
    stop_words = set(stop_words)
    return [w for w in text if w.lower() not in stop_words]

In [None]:
# Rebind connect_en to a plain list of the tokens that are not English stop words.
connect_en = strip_en_stopwords(connect_en)

#### More cleaning: let's get rid of some common punctuation marks

In [None]:
# Punctuation marks to filter out of the token list, one list entry per character.
punct = list(".,-:;()/?!+'\"")

In [None]:
# Drop punctuation tokens. The corpus reader's word tokenizer can emit a run of
# punctuation as ONE token (e.g. "...", ").", "!!"), which a plain
# `w not in punct` test would let through. So a token is removed when it is
# listed in punct, or when every character in it is listed in punct.
punct_chars = set(punct)
connect_en = [
    w for w in connect_en
    if not (w in punct_chars or set(w) <= punct_chars)
]

In [None]:
# Sanity check: show the first 25 tokens that remain at this point.
connect_en[:25]

#### and get rid of numbers

In [None]:
import re

# Remove every token that contains at least one digit.
# The pattern is a raw string: in a normal string literal '\d' is an invalid
# escape sequence, which current Python versions warn about.
connect_en = [item for item in connect_en if not re.search(r'\d', item)]

#### and get rid of the anonymised FirstName and LastName and the language identifier

In [None]:
# Remove every token that contains one of the anonymisation placeholders or
# the language identifier anywhere inside it.
placeholder_pattern = re.compile('FirstName|LastName|English')
connect_en = [token for token in connect_en if placeholder_pattern.search(token) is None]

In [None]:
# Sanity check: show the first 25 tokens of the cleaned list.
connect_en[:25]

### Most common collocations

In [None]:
### convert our list to a nltk text, which has a lot of methods to do advanced analysis tasks
text = nltk.Text(connect_en)

In [None]:
# NOTE(review): nltk is believed to rank collocations by an association score
# rather than by raw frequency, so "most common" may be imprecise — confirm.
text.collocations(25)     # print up to 25 collocations (multi-word expressions)

### Find out the frequency distributions of the most common words

In [None]:
# Build a frequency distribution over the tokens of the cleaned text.
fdist = nltk.FreqDist(text)

In [None]:
# Show the 25 most frequent tokens with their counts. Passing the limit to
# most_common() avoids sorting the entire distribution just to slice it.
fdist.most_common(25)