In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import nltk

from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer



# Getting Data & Assigning Sentiment

In [2]:
review_df = pd.read_csv('../smp_data/ME_NH_RI_reviews.csv', encoding='utf-8',low_memory=False)

In [10]:
# get rid of any of the middle reviews, trying to balance positive and negative
revPosNeg = review_df.ix[(review_df['overall']<=2.0) | (review_df['overall'] == 5.0)]

In [11]:
# Including only certain departments (these have over 800 profs for both men & women in both sentiments)
revSet_df = revPosNeg[revPosNeg['department'].isin(['English','Mathematics','Biology','Psychology'])]

In [6]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

stopwords = nltk.corpus.stopwords.words('english')

# return content words and their stems
def content_stems(text, stopwords):
    tokens = nltk.word_tokenize(text)
    content = [word for word in tokens if word.lower() not in stopwords]
    stems = [stemmer.stem(t) for t in content]
    return [content,stems]

In [12]:
revSet_df['words'] = revSet_df['cleanText'].apply(lambda x: content_stems(x,stopwords)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
# assign sentiment positive or negative
# need to do vectorized version of this
def sentiment(row):
    if row['overall'] == 5:
        val = 1
    elif row['overall'] <= 2.0:
        val = 0
    else:
        val = -1
    return val

In [16]:
# assign sentiment to all the reviews
revSet_df['sentiment'] = revSet_df.apply(sentiment,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [17]:
revSet_df = revSet_df[['profID','genderBest','department','sentiment','cleanText','words']]

# Test Sentiment Prediction

In [27]:
def simplePredict(myList):
if 'worst' in myList:
    return 0
elif 'best' in myList:
    return 1
elif 'good' in myList:
    return 1
else:
    return 0

IndentationError: expected an indented block (<ipython-input-27-ece42a00ae76>, line 2)

# Predict Sentiment from Text

In [18]:
# only use reviews where text is not "no comments"
revSet_df = revSet_df[revSet_df['cleanText']!="no comments "]

In [19]:
# pick out the train and the test sets
train_set = revSet_df

#pulling out the test set, can add in random_state=89 to be able to replicate results
test_set = train_set.sample(frac=.2)
train_set=train_set.drop(test_set.index)
# training to the sentiment
y_train = train_set['sentiment']
y_test = test_set['sentiment']


# Initialize TFIDF vectorizor object (no n-grams)
vectorizer = TfidfVectorizer( max_features = 40000, sublinear_tf = True )

# fit_transform() fits the model & and learns the vocab (features)
# then it transforms the training data into feature vectors. 
# The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train_set['cleanText'])

# convert to array
train_data_features = train_data_features.toarray()

In [20]:
# vocab = feature names
vocab = vectorizer.get_feature_names()

In [22]:
# Initialize a Random Forest classifier with 100 trees
# oob_score gives "out of bag" errors
forest = RandomForestClassifier(n_estimators = 100,oob_score=True) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, y_train )

In [23]:
# how does the train set do on the out of bag reviews?
forest.oob_score_

0.92722632875747979

In [24]:
# cross-validation
# note need to update to latest sklearn which has this under sklearn.model_selection
from sklearn import cross_validation

In [25]:
# this cross validation is very costly
# may just want to go with the oob score (note that it is very close to cv score)
scores = cross_validation.cross_val_score(forest, train_data_features, y_train, cv=5)
scores

array([ 0.9023532 ,  0.90387154,  0.90737074,  0.8990099 ,  0.90649065])

In [26]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.90 (+/- 0.01)


In [30]:
# look at result on train data
trainResult = forest.predict(train_data_features)

In [31]:
# confusion matrix for train data
confusion_matrix(train_set["sentiment"], trainResult)

array([[ 8810,    17],
       [    0, 13901]])

# Model Output and Evaluation

In [32]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(test_set['cleanText'])
test_data_features = test_data_features.toarray()

In [33]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "profID" & sentiment prediction
output = pd.DataFrame( data={'sentimentActual':test_set["sentiment"], 'gender':test_set['genderBest'],'depart':test_set['department'], "sentimentPredict":result} )

In [34]:
output.head()

Unnamed: 0,depart,gender,sentimentActual,sentimentPredict
64121,English,female,0,0
105316,Psychology,male,1,1
80222,Psychology,male,0,0
136767,English,female,0,0
131989,Mathematics,male,0,0


In [35]:
confusion_matrix(test_set["sentiment"], result)

array([[1904,  229],
       [ 119, 3430]])

In [36]:
# model evaluation
from sklearn.metrics import classification_report

In [37]:
print(classification_report(test_set['sentiment'],result))

             precision    recall  f1-score   support

          0       0.94      0.89      0.92      2133
          1       0.94      0.97      0.95      3549

avg / total       0.94      0.94      0.94      5682



# Gender Differences: Female Professors

In [550]:
# Female professors
femaleSet_df = revPosNeg[(revPosNeg['department'].isin(['English','Mathematics','Biology','Psychology']))&(revPosNeg['genderBest']=='female')]
femaleSet_df = revSet_df[['profID','genderBest','department','sentiment','cleanText','words']]

In [551]:
# only use reviews where text is not "no comments"
femaleSet_df = femaleSet_df[revSet_df['cleanText']!="no comments "]

In [552]:
# pick out the train and the test sets
train_set = femaleSet_df

#pulling out the test set
test_set = train_set.sample(frac=.2,random_state=2411)
train_set=train_set.drop(test_set.index)
# training to the sentiment
y_train = train_set['sentiment']
y_test = test_set['sentiment']


# Initialize TFIDF vectorizor object (no n-grams)
vectorizer = TfidfVectorizer( max_features = 40000, sublinear_tf = True )

# fit_transform() fits the model & and learns the vocab (features)
# then it transforms the training data into feature vectors. 
# The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train_set['cleanText'])

# convert to array
train_data_features = train_data_features.toarray()

In [553]:
# Initialize a Random Forest classifier with 100 trees
# oob_score gives "out of bag" errors
forest = RandomForestClassifier(n_estimators = 100,oob_score=True) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, y_train )

In [554]:
# vocab = feature names
vocab = vectorizer.get_feature_names()

In [555]:
# how does the train set do on the out of bag reviews?
forest.oob_score_

0.92705033438929951

In [556]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(test_set['cleanText'])
test_data_features = test_data_features.toarray()

In [557]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "profID" & sentiment prediction
output = pd.DataFrame( data={'sentimentActual':test_set["sentiment"], 'gender':test_set['genderBest'],'depart':test_set['department'], "sentimentPredict":result} )

In [558]:
femImportances = forest.feature_importances_

In [559]:
confusion_matrix(test_set["sentiment"], result)

array([[1950,  280],
       [ 108, 3344]])

In [560]:
femImp_df = pd.DataFrame({'vocab':vocab,'import':femImportances}).sort_values('import')

In [562]:
#imp_df = imp_df.reset_index()
femImp_df = femImp_df.reset_index()

In [527]:
impVocabBoth=pd.merge(imp_df[-100:],femImp_df[-100:],on='vocab',how='outer',suffixes=('_all','_fem'),indicator=True)

In [543]:
impVocabBoth

Unnamed: 0,level_0,index_all,import_all,vocab,index_fem,import_fem,_merge
0,17865.0,5201.0,0.002149,even,5181.0,0.002188,both
1,17866.0,2273.0,0.002150,cares,2273.0,0.002369,both
2,17867.0,15155.0,0.002157,students,15141.0,0.002213,both
3,17868.0,10810.0,0.002163,only,,,left_only
4,17869.0,6777.0,0.002203,guy,6755.0,0.002253,both
5,17870.0,16262.0,0.002224,too,16256.0,0.002107,both
6,17871.0,2716.0,0.002262,clear,2721.0,0.002156,both
7,17872.0,8454.0,0.002267,just,8401.0,0.002426,both
8,17873.0,9393.0,0.002288,makes,9366.0,0.002245,both
9,17874.0,1773.0,0.002304,book,1769.0,0.002266,both


# Male professors

In [528]:
# Male professors
maleSet_df = revPosNeg[(revPosNeg['department'].isin(['English','Mathematics','Biology','Psychology']))&(revPosNeg['genderBest']=='male')]
maleSet_df = revSet_df[['profID','genderBest','department','sentiment','cleanText','words']]

In [529]:
# only use reviews where text is not "no comments"
maleSet_df = maleSet_df[revSet_df['cleanText']!="no comments "]

In [530]:
# pick out the train and the test sets
train_set = maleSet_df

#pulling out the test set
test_set = train_set.sample(frac=.2,random_state=2411)
train_set=train_set.drop(test_set.index)
# training to the sentiment
y_train = train_set['sentiment']
y_test = test_set['sentiment']


# Initialize TFIDF vectorizor object (no n-grams)
vectorizer = TfidfVectorizer( max_features = 40000, sublinear_tf = True )

# fit_transform() fits the model & and learns the vocab (features)
# then it transforms the training data into feature vectors. 
# The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train_set['cleanText'])

# convert to array
train_data_features = train_data_features.toarray()

In [531]:
# Initialize a Random Forest classifier with 100 trees
# oob_score gives "out of bag" errors
forest = RandomForestClassifier(n_estimators = 100,oob_score=True) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, y_train )

In [532]:
# vocab = feature names
vocab = vectorizer.get_feature_names()

In [533]:
# how does the train set do on the out of bag reviews?
forest.oob_score_

0.92559838085181279

In [534]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(test_set['cleanText'])
test_data_features = test_data_features.toarray()

In [535]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "profID" & sentiment prediction
output = pd.DataFrame( data={'sentimentActual':test_set["sentiment"], 'gender':test_set['genderBest'],'depart':test_set['department'], "sentimentPredict":result} )

In [536]:
maleImportances = forest.feature_importances_

In [537]:
maleImp_df = pd.DataFrame({'vocab':vocab,'import':femImportances}).sort_values('import')

In [538]:
#imp_df = imp_df.reset_index()
maleImp_df = maleImp_df.reset_index()

In [542]:
maleImp_df[-50:]

Unnamed: 0,index,import,vocab
17910,17247,0.003319,waste
17911,7678,0.003396,in
17912,5191,0.003411,ever
17913,17431,0.003446,when
17914,1014,0.003454,at
17915,17519,0.003626,willing
17916,6487,0.003768,good
17917,10678,0.003884,of
17918,435,0.003945,all
17919,6992,0.003948,he


In [546]:
impVocabBoth.head()

Unnamed: 0,level_0,index_all,import_all,vocab,index_fem,import_fem,_merge
0,17865.0,5201.0,0.002149,even,5181.0,0.002188,both
1,17866.0,2273.0,0.00215,cares,2273.0,0.002369,both
2,17867.0,15155.0,0.002157,students,15141.0,0.002213,both
3,17868.0,10810.0,0.002163,only,,,left_only
4,17869.0,6777.0,0.002203,guy,6755.0,0.002253,both


In [563]:
impVocabMF=pd.merge(femImp_df[-100:],maleImp_df[-100:],on='vocab',how='outer',suffixes=('_fem','_male'),indicator=True)

In [567]:
impVocabMF['diff']=impVocabMF['import_fem']-impVocabMF['import_male']

In [582]:
impVocabMF = impVocabMF[['vocab','import_fem','import_male']].fillna(0)

In [587]:
impVocabMF['femRank']=impVocabMF['import_fem'].rank(ascending=0)
impVocabMF['maleRank']=impVocabMF['import_male'].rank(ascending=0)
impVocabMF['rankDiff']=impVocabMF['femRank']-impVocabMF['maleRank']

In [588]:
impVocabMF.sort_values('rankDiff')

Unnamed: 0,vocab,import_fem,import_male,femRank,maleRank,rankDiff
50,like,0.003423,0.002654,50.0,69.0,-19.0
36,or,0.002858,0.002441,64.0,79.0,-15.0
21,dr,0.002512,0.002196,79.0,93.0,-14.0
57,nothing,0.003832,0.003037,43.0,56.0,-13.0
9,yourself,0.002242,0.000000,91.0,103.5,-12.5
20,makes,0.002454,0.002245,80.0,91.0,-11.0
7,prof,0.002176,0.000000,93.0,103.5,-10.5
48,love,0.003103,0.002873,52.0,62.0,-10.0
87,this,0.007387,0.006219,13.0,23.0,-10.0
46,take,0.003061,0.002844,54.0,63.0,-9.0
