# Data Science Challenge - Parsing the Connect advertising texts with NLTK

## A quick tutorial

In [None]:
################################## Preconditions for running this iPython notebook ##########################################
### You need to have the Connect advertising text data set saved somewhere. In this tutorial it is renamed 'connect.txt'  ###
### You also need to have nltk package installed using pip or anaconda...                                                 ###
#############################################################################################################################

In [1]:
import nltk

In [2]:
from nltk.corpus import PlaintextCorpusReader

### Getting things ready

#### We have both German and English in the corpus. So let's re-do this and extract just the lines of text that contain English, for this exercise.

In [3]:
# Plain Python, no nltk involved: stream the original file and copy every line
# that contains the substring "English" into a new file; all other lines are dropped.

### change the paths to your own paths to these files

oldfile = r'C:\Users\lon\Desktop\connect.txt'         # input: the original connect advertising texts data set
newfile = r'C:\Users\lon\Desktop\connect_new.txt'     # output: created (or overwritten) by this cell

with open(oldfile, encoding='utf8') as old:
    with open(newfile, 'w', encoding='utf8') as new:
        for line in old:
            # skip anything that does not mention "English"
            if "English" not in line:
                continue
            new.write(line)


#### Now we will get the text from this new, English-only file

In [4]:
#### change the path to your own

# Directory that contains 'connect_new.txt'; the reader is restricted to that one file.
corpus_root = r'C:\Users\lon\Desktop'
wordlists = PlaintextCorpusReader(root=corpus_root, fileids='connect_new.txt')

In [5]:
# Tokenise the file: words() gives the corpus as a list-like sequence of string
# tokens, with punctuation marks as separate tokens (see the preview printed below).
connect_en = wordlists.words('connect_new.txt')

In [6]:
# quick look at the token sequence; the printed form is abbreviated with '...'
print(connect_en)

['103', 'English', '?', '245', 'English', '?', '266', ...]


#### Get rid of pesky little words like "the", "it", "is", etc. (called "stopwords" in computational linguistics)

In [7]:
# Make sure the 'stopwords' corpus is available locally. If it has not been
# downloaded yet this fetches it; if it is already present the call just reports
# that the package is up-to-date (see the log output below).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def strip_en_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: iterable of string tokens.
        stop_words: optional iterable of lower-case stopwords to remove.
            When omitted, NLTK's English stopword list is used (this needs
            the 'stopwords' corpus to be downloaded).

    Returns:
        A new list holding the surviving tokens in their original order and
        with their original casing. Tokens are lower-cased only for the
        comparison, so 'The' is removed when 'the' is a stopword.
    """
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests; a plain list would be
    # scanned from the start for every single token of the corpus.
    stop_set = set(stop_words)
    return [w for w in text if w.lower() not in stop_set]

In [9]:
# replace the token sequence with a plain list that no longer contains English stopwords
connect_en = strip_en_stopwords(connect_en)

####  More cleaning, let's get rid of some common punctuation marks

In [11]:
# one list entry per punctuation character: . , - : ; ( ) / ? ! + ' "
punct = list(".,-:;()/?!+'\"")

In [13]:
# Keep only tokens that are not exactly one of the punctuation marks in `punct`.
# NOTE(review): the match is on the whole token, so a token made of several
# punctuation characters would pass through - confirm whether the tokenizer emits such tokens.
connect_en = [token for token in connect_en if token not in punct]

In [14]:
# peek at the first 25 tokens to check the result of the cleaning so far
connect_en[:25]

['103',
 'English',
 '245',
 'English',
 '266',
 'English',
 '285',
 'English',
 '290',
 'English',
 'Benjamin',
 'Joseph',
 'LastName',
 '8',
 'years',
 'SOLID',
 'working',
 'experience',
 'software',
 'engineering',
 'experience',
 'various',
 'project',
 'types',
 'embedded']

#### and get rid of numbers

In [15]:
import re

# Drop every token that contains at least one digit. The pattern is written as a
# raw string: in a normal string literal '\d' is an invalid escape sequence,
# which Python reports with a warning (and plans to reject in the future).
connect_en = [item for item in connect_en if not re.search(r'\d', item)]

#### and get rid of the anonymised FirstName and LastName and the language identifier

In [16]:
# Drop any token that contains 'FirstName', 'LastName' or 'English' as a
# substring (re.search returns None when the pattern is not found in the token).
connect_en = [token for token in connect_en
              if re.search('FirstName|LastName|English', token) is None]

In [17]:
# peek at the first 25 tokens again: numbers, placeholders and the language tag should be gone
connect_en[:25]

['Benjamin',
 'Joseph',
 'years',
 'SOLID',
 'working',
 'experience',
 'software',
 'engineering',
 'experience',
 'various',
 'project',
 'types',
 'embedded',
 'systems',
 'Windows',
 'applications',
 'Web',
 'applications',
 'cloud',
 'backend',
 'applications',
 'experience',
 'working',
 'japanese',
 'swiss']

### Let's look at the most common collocations

In [18]:
### to do that, we wrap our token list in an nltk.Text object, which offers methods for analysis tasks such as collocations
text = nltk.Text(connect_en)

In [19]:
# print 25 collocations (multi-word expressions) found in the text, separated by ';'
# NOTE(review): "most common" may not be accurate - NLTK appears to rank collocations
# by an association score rather than raw frequency; confirm against the nltk.Text docs.
text.collocations(25)

Software Engineer; gained experience; several years; NET framework;
back end; Active Directory; Eclipse Helios; JSF JSP; Marketing Sales;
Motion Graphics; Product Owner; construction installation; firewalls
Active; unter Beweis; software engineer; Project Manager; Apache
Maven; Executive Search; among others; configuration administration;
geographically distributed; long term; mail servers; maintainance
network; servers firewalls


### Let's get nifty and find out the frequency distributions of the most common words

In [20]:
# Count how often each token occurs in the text.
# NOTE(review): counting is case-sensitive - in the output below 'software' and
# 'Software' appear as separate entries; lower-case the tokens first if that is not intended.
fdist = nltk.FreqDist(text)

In [21]:
# The 25 most frequent tokens with their counts. Passing the limit to
# most_common() avoids sorting the whole vocabulary only to slice off the top.
fdist.most_common(25)

[('experience', 59),
 ('software', 40),
 ('development', 35),
 ('projects', 28),
 ('years', 28),
 ('project', 20),
 ('working', 18),
 ('design', 18),
 ('work', 18),
 ('also', 17),
 ('engineer', 17),
 ('developer', 16),
 ('well', 15),
 ('Software', 14),
 ('new', 13),
 ('web', 13),
 ('quality', 13),
 ('requirements', 13),
 ('professional', 13),
 ('skills', 13),
 ('several', 12),
 ('und', 12),
 ('solutions', 12),
 ('technologies', 12),
 ('agile', 12)]