In [1]:
# in this file we are goona work on the text data, extract features from text and build models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

from textblob import TextBlob

In [2]:
df = pd.read_csv(r'C:\Users\Adarsh\uOttawa\ML Course Project-7Apr\datasets\2_text_data.csv')

In [3]:
df.shape

(40000, 2)

In [4]:
df.columns

Index(['feedback', 'job_status'], dtype='object')

In [5]:
df.tail(20)

Unnamed: 0,feedback,job_status
39980,I have been working at Google full-time (More ...,1
39981,A behemoth in flux Best-in-class people. Reall...,1
39982,"Been there, done that. They make great, unique...",1
39983,Work Culture. 1) A lot to learn 2) High Bar 3)...,1
39984,Probably not the best but not the worst either...,1
39985,"Fast paced, fun, challenging You get to work w...",1
39986,"Great company, work/life flexibility, but too ...",1
39987,So much change in 3 years Fresh enthusiasm kee...,1
39988,Hardware engineer intern You're working with s...,1
39989,Program Manager Good pay. Work life balance de...,1


In [6]:
# 1. Feature extraction : number of words
# basic idea is to extract number words in each title 
# intuition behind with this technique is - some collection/categories need longer title than others

# make a new feature  word_count and calculate the words by splitting the strin on white space
df['feedback_word_count'] = df['feedback'].apply( lambda x : len(str(x).split(" ") ) )

df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count
0,Early opinions of working at amazon Exciting. ...,0,88
1,Amazing company to work for! Very friendly env...,0,104
2,Different and great company - you get huge res...,0,151
3,Better than your average sweat shop! Good bene...,0,153
4,"Great Company to Work For, Highly Recommended ...",0,27


In [7]:
# 2. Feature : Average word length (Sum categories might need longer words than others)
# formula : Sum (length of all the words in the tweet or doc ) / (total length of the tweet or doc)

def avg_word_length(sentence):
    words = sentence.split()
    return ( sum( len(word) for word in words ) / len(words))

df['avg_word_len'] = df['feedback'].apply( lambda x: avg_word_length(x) )

# print
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len
0,Early opinions of working at amazon Exciting. ...,0,88,4.534091
1,Amazing company to work for! Very friendly env...,0,104,4.336538
2,Different and great company - you get huge res...,0,151,4.516556
3,Better than your average sweat shop! Good bene...,0,153,4.895425
4,"Great Company to Work For, Highly Recommended ...",0,27,6.37037


In [8]:
# change type of feedback from object to string to avoid future errors
df['feedback'] = df['feedback'].astype('str') 

In [9]:
def sentiment(x):
    sentiment = TextBlob(x)
    return sentiment.sentiment.polarity

df['sentiment_score'] = df['feedback'].apply(sentiment)


In [10]:
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score
0,Early opinions of working at amazon Exciting. ...,0,88,4.534091,0.124329
1,Amazing company to work for! Very friendly env...,0,104,4.336538,0.448133
2,Different and great company - you get huge res...,0,151,4.516556,0.37381
3,Better than your average sweat shop! Good bene...,0,153,4.895425,0.199087
4,"Great Company to Work For, Highly Recommended ...",0,27,6.37037,0.48


In [12]:
# assign numeric values to the sentiments

sentiment_numeric = []

for score in df['sentiment_score']:
    # positive sentiments will have value "+1"
    if score > 0:
        sentiment_numeric.append(1)
    # positive sentiments will have value "-1"
    elif score < 0:
        sentiment_numeric.append(-1)
    # positive sentiments will have value "0"
    elif score == 0:
        sentiment_numeric.append(0)

df['sentiment'] = sentiment_numeric
df.sentiment.value_counts()

 1    36486
-1     2794
 0      720
Name: sentiment, dtype: int64

In [13]:
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment
0,Early opinions of working at amazon Exciting. ...,0,88,4.534091,0.124329,1
1,Amazing company to work for! Very friendly env...,0,104,4.336538,0.448133,1
2,Different and great company - you get huge res...,0,151,4.516556,0.37381,1
3,Better than your average sweat shop! Good bene...,0,153,4.895425,0.199087,1
4,"Great Company to Work For, Highly Recommended ...",0,27,6.37037,0.48,1


In [14]:
# let's start cleaning the data 

In [15]:
# 1. Lower casing  - change all the words to lower case to avoid duplication. Because "Python" and "python" considered 2 words
# we will split the title into words and then convert those words into lower case and then join

df['feedback'] = df['feedback'].apply( lambda x : " ".join( x.lower() for x in x.split() ) )
df['feedback'].head(5)

0    early opinions of working at amazon exciting. ...
1    amazing company to work for! very friendly env...
2    different and great company - you get huge res...
3    better than your average sweat shop! good bene...
4    great company to work for, highly recommended ...
Name: feedback, dtype: object

In [16]:
# 2. Remove punctuations and special charactaers
df['feedback'] = df['feedback'].str.replace('[^\w\s]','')
# the [^\w\s] means remove everything, keep only words(w) and spaces(s)
# this step should be done after feature extraction like hashtags, user tagged
df['feedback'].head()

0    early opinions of working at amazon exciting i...
1    amazing company to work for very friendly envi...
2    different and great company  you get huge resp...
3    better than your average sweat shop good benef...
4    great company to work for highly recommended p...
Name: feedback, dtype: object

In [17]:
# 3 . Removal of stop words - the, a , and etc. These are most commonly occuring words, and may created irrevelent 
# baises to our model

#import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# we are gonna split the title into words and then filter the stop words and join them back
df['feedback'] = df['feedback'].apply( lambda x : " ".join( x for x in x.split() if x not in stop) )
df['feedback'].head(5)

0    early opinions working amazon exciting right t...
1    amazing company work friendly environment empl...
2    different great company get huge responsibilit...
3    better average sweat shop good benefits sharp ...
4    great company work highly recommended pay work...
Name: feedback, dtype: object

In [18]:
# 4. Frequent word removal from the text; text which are not stopwords
# first we will take whole tweets data and split into words and then calculate their frequency
#  join words only with strings so, there needs to be some string to join other string
all_words = ' '.join( df['feedback'] ).split()
freq = pd.Series(all_words).value_counts()[:10]
freq

work          40202
great         26805
people        23197
good          22790
company       21379
none          18636
get           12625
management    11806
benefits      11400
employees     11068
dtype: int64

In [19]:
# 4a)the freq words might add some bias to our model and not contribute to our model 
# remove those common words which are  ouccring more than 10,000 times using "freq"
df['feedback'] = df['feedback'].apply( lambda x: " ".join( x for x in x.split() if x not in freq ) )
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment
0,early opinions working amazon exciting right t...,0,88,4.534091,0.124329,1
1,amazing friendly environment employee discount...,0,104,4.336538,0.448133,1
2,different huge responsibility fast impact righ...,0,151,4.516556,0.37381,1
3,better average sweat shop sharp cutting edge b...,0,153,4.895425,0.199087,1
4,highly recommended pay load advancement techno...,0,27,6.37037,0.48,1


In [20]:
# 4c) remove none word, becuase it was given when title was not present properly
df['feedback'] = df['feedback'].apply( lambda x: " ".join( x for x in x.split() if x not in ["none", "None"] ) )
df.head(5)


Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment
0,early opinions working amazon exciting right t...,0,88,4.534091,0.124329,1
1,amazing friendly environment employee discount...,0,104,4.336538,0.448133,1
2,different huge responsibility fast impact righ...,0,151,4.516556,0.37381,1
3,better average sweat shop sharp cutting edge b...,0,153,4.895425,0.199087,1
4,highly recommended pay load advancement techno...,0,27,6.37037,0.48,1


In [21]:
# 5. Remove Rare words, which will not contribute to our model
all_words = ' '.join( df['feedback'] ).split()
rarely = pd.Series(all_words).value_counts()[-55000:]
rarely.sort_values

<bound method Series.sort_values of time                      10270
place                      9514
working                    8904
job                        8854
pay                        8645
culture                    7745
environment                7647
like                       7364
team                       7252
amazon                     6994
lot                        6951
dont                       6774
managers                   6584
many                       6352
hours                      6314
really                     6143
balance                    6056
make                       5941
best                       5811
much                       5810
hard                       5638
apple                      5373
life                       5269
one                        5265
new                        5174
manager                    5092
opportunities              5059
experience                 4902
career                     4749
microsoft                  4612
    

In [25]:
df['feedback'].head(5)

0    early opinions working amazon exciting right t...
1    amazing friendly environment employee discount...
2    different huge responsibility fast impact righ...
3    better average sweat shop sharp cutting edge b...
4    highly recommended pay load advancement techno...
Name: feedback, dtype: object

In [23]:
# the abover words might not contribute to our model
# remove those rare words 
df['feedback1'] = df['feedback'].apply( lambda x: " ".join( x for x in x.split() if x not in rarely ) )


In [24]:
df['feedback1'].head(15)

0     
1     
2     
3     
4     
5     
6     
7     
8     
9     
10    
11    
12    
13    
14    
Name: feedback1, dtype: object

In [27]:
df.feedback.head(5)

0    early opinions working amazon exciting right t...
1    amazing friendly environment employee discount...
2    different huge responsibility fast impact righ...
3    better average sweat shop sharp cutting edge b...
4    highly recommended pay load advancement techno...
Name: feedback, dtype: object

In [28]:
# Check again for  rare words
all_words = ' '.join( df['feedback'] ).split()
rarely = pd.Series(all_words).value_counts()
rarely.sort_values

<bound method Series.sort_values of time                      10270
place                      9514
working                    8904
job                        8854
pay                        8645
culture                    7745
environment                7647
like                       7364
team                       7252
amazon                     6994
lot                        6951
dont                       6774
managers                   6584
many                       6352
hours                      6314
really                     6143
balance                    6056
make                       5941
best                       5811
much                       5810
hard                       5638
apple                      5373
life                       5269
one                        5265
new                        5174
manager                    5092
opportunities              5059
experience                 4902
career                     4749
microsoft                  4612
    

In [29]:
# 6. Spelling correction  -- not  necessary just need to remove the typos (rare words, which are accidently introduced)

# most common issue to deal with; spelling mistake, typos, shortcuts, abbreviatiosn (very common in when typing)
# spelling correction also help remove duplication that created by spelling mistakes like python, pythn, pythom etc.

# ** TextBlob is a Python library for processing textual data.
# Can be used for common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification,
# translation, and more.

# Spelling correction takes a lot of time. Also, we cannot always expect it to be accurate. 
# words are often used in their abbreviated form. For instance, ‘your’ is used as ‘ur’. We should treat this before the spelling
# correction step, otherwise these words might be transformed into any other word.

from textblob import TextBlob

# let's try spelling correction on 5 titles
# TextBlob takes one title and checks for each word for correction and corrects spellings and returns TextBlob; we can convert 
# into a string
#type(TextBlob("ysfunctional selfish drags kiss dysfu").correct())


# !!!!!!!!!!!!!!!!!!!!!!!!  Warning: This step is gonna take too much time maybe a few hours
#train['title'].apply( lambda x :  str( TextBlob(x).correct() ) )


In [30]:
# 7 . Lemmatization ; its preferred over stemming because if finds the root word
import nltk
nltk.download('wordnet')
from textblob import Word 



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# perform lemmatization on the title
df['feedback'] = df['feedback'].apply( lambda x : " ".join( [Word(word).lemmatize() for word in x.split()  ]) )
df['feedback'].head(5)

0    early opinion working amazon exciting right te...
1    amazing friendly environment employee discount...
2    different huge responsibility fast impact righ...
3    better average sweat shop sharp cutting edge b...
4    highly recommended pay load advancement techno...
Name: feedback, dtype: object

In [32]:
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment,feedback1
0,early opinion working amazon exciting right te...,0,88,4.534091,0.124329,1,
1,amazing friendly environment employee discount...,0,104,4.336538,0.448133,1,
2,different huge responsibility fast impact righ...,0,151,4.516556,0.37381,1,
3,better average sweat shop sharp cutting edge b...,0,153,4.895425,0.199087,1,
4,highly recommended pay load advancement techno...,0,27,6.37037,0.48,1,


In [33]:
# better save the file to avoid time consuming re-work on lemmatization
df.to_csv(r'C:\Users\Adarsh\uOttawa\ML Course Project-7Apr\datasets\4a-after-text-cleaning.csv', 
                    index=False)

In [34]:
# Now our data cleaning has almost been done. It's time to extract more features :
# 1. n-grams
# 2. tf-idf
# 3. bag of words

df = pd.read_csv(r'C:\Users\Adarsh\uOttawa\ML Course Project-7Apr\datasets\4a-after-text-cleaning.csv')

In [35]:
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment,feedback1
0,early opinion working amazon exciting right te...,0,88,4.534091,0.124329,1,
1,amazing friendly environment employee discount...,0,104,4.336538,0.448133,1,
2,different huge responsibility fast impact righ...,0,151,4.516556,0.37381,1,
3,better average sweat shop sharp cutting edge b...,0,153,4.895425,0.199087,1,
4,highly recommended pay load advancement techno...,0,27,6.37037,0.48,1,


In [30]:
# calculating tf-idf using scikitlearn
# sublinear_df  - is set to True to use a logarithmic form for frequency.
# min_df - is the minimum numbers of documents a word must be present in to be kept.
# norm - is set to l2, to ensure all our feature vectors have a euclidian norm of 1.
# ngram_range -  is set to (1, 2) to indicate that we want to consider both unigrams and bigrams.
# stop_words - is set to "english" to remove all common pronouns ("a", "the", ...) to reduce the number of noisy features.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=50, norm='l2', encoding='latin-1', ngram_range=(1, 3))

features = tfidf.fit_transform(df.feedback.values.astype('str')).toarray()
labels = df.job_status

In [31]:

# create a dataframe to lookup which label value belongs to which collection?
#category_id_df = df[['Product', 'category_id']].drop_duplicates().sort_values('category_id')
job_status_df = df[['job_status']].drop_duplicates().sort_values('job_status')


category_ids = job_status_df.values
#id_to_category = dict(collection_labels_df[['collection', 'collection_labels']].values)

category_ids


array([[0],
       [1]], dtype=int64)

In [32]:
# let's find some interesting corelated unigrams and bigrams and trigrams
from sklearn.feature_selection import chi2
import numpy as np

N = 5

for status in sorted(category_ids):
    
    
    features_chi2 = chi2(features, labels == labels)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    
    print("# '{}':".format(status))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))
    print("  . Most correlated trigrams:\n       . {}".format('\n       . '.join(trigrams[-N:])))

# '[0]':
  . Most correlated unigrams:
       . nan
  . Most correlated bigrams:
       . 
  . Most correlated trigrams:
       . 
# '[1]':
  . Most correlated unigrams:
       . nan
  . Most correlated bigrams:
       . 
  . Most correlated trigrams:
       . 


In [33]:
# further data exploration
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df.feedback.values.astype('str'))

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index


In [34]:
data_dtm.columns

Index(['nan'], dtype='object')