In [74]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc, accuracy_score
from scipy import sparse
import string

In [75]:
df = pd.read_csv("gender-classifier-DFE-791531.csv", encoding = 'latin-1')
df.head(5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,10/1/12 13:51,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,6/11/09 22:39,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,4/16/14 13:23,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,,


In [76]:
#keeping only the ones where we are 'mostly' confident it is male/female
df = df[df['gender:confidence'] > .8]
df.shape

(13939, 26)

In [77]:
#selecting only columns I want
cols = ['gender', 'text']
df = df[cols]
df.head()

Unnamed: 0,gender,text
0,male,Robbie E Responds To Critics After Win Against...
1,male,ÛÏIt felt like they were my friends and I was...
3,male,Hi @JordanSpieth - Looking at the url - do you...
4,female,Watching Neighbours on Sky+ catching up with t...
5,female,"Ive seen people on the train with lamps, chair..."


In [78]:
male = df[df['gender'] == 'male'] 
female = df[df['gender'] == 'female']
df = pd.concat([male, female])

In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10029 entries, 0 to 20049
Data columns (total 2 columns):
gender    10029 non-null object
text      10029 non-null object
dtypes: object(2)
memory usage: 235.1+ KB


In [80]:
#let's factorize gender
#male = 0, female = 1
df['gender'] = pd.factorize(df.gender)[0]
df.head()

Unnamed: 0,gender,text
0,0,Robbie E Responds To Critics After Win Against...
1,0,ÛÏIt felt like they were my friends and I was...
3,0,Hi @JordanSpieth - Looking at the url - do you...
7,0,Gala Bingo clubs bought for å£241m: The UK's l...
17,0,@coolyazzy94 Ditto - I'm still learning the fa...


### Text Cleaning

In [81]:
def count_exclamation(text):
    c = 0
    for char in text:
        if char == '!':
            c+=1
        else:
            pass
    return c

In [82]:
def get_hashtags(text, order=False):
    tags = set([item.strip("#.,-\"\'&*^!") for item in text.split() 
                if (item.startswith("#") and len(item) < 256)])
    return sorted(tags) if order else tags

In [83]:
#Remove emoticon after counting
def remove_hash(text_string):
    txt = []

    for w in text_string.split():
        if w.startswith("#"):
            t = w.replace("#","") #this will just remove the hashtag, not the tag itself
        else:

            t = w
        txt.append(t)
    text = ' '.join(txt)
    return text

In [84]:
#this will have to be used after emoticons
import re
def remove_handles_and_urls(text_string):
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z' \t])|(\w+:\/\/\S+)",
                           " ",text_string).split("\s"))
    return text

In [85]:
EMOTICON_LIST = [':-)', ':)', '(:', '(-:', ':P',':-D', ':D', 'X-D', 'XD', 'xD', '<3', ':*', ';-)', ';)', ';-D', ';D', '(;', '(-;', ':-(', ':(', '):', ')-:', ':,(', ":'(", ':"(', ':((']

In [86]:
#Count emoticons
def count_emoticon(text_string):
    counter = 0
    for w in text_string.split():
        if w in EMOTICON_LIST:
            counter+=1
        else:
            counter = counter
    return counter

In [87]:
#this will have to be used after emoticons
#import re
def count_mentions(text_string):
    mentions = []
    for i in text_string.split():
        if '@' in i:
            m = i
            mentions.append(m)
        else:
            pass
    return mentions

In [88]:
#Remove emoticon after counting
def remove_emoticon(text_string):
    txt = []
    for w in text_string.split():
        if w not in EMOTICON_LIST:
            t = w
        else:
            t = ""
        txt.append(t)
        text = ' '.join(txt)
    return text

In [89]:
def tok(s):
    return re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])').sub(r' \1 ',s).split()

In [90]:
def tokens(x):
    return x.split()

In [91]:
data = []
for i in range(len(df)):
    hashtags = get_hashtags(df.iloc[i]['text'])
    num_exclamations = count_exclamation(df.iloc[i]['text'])
    num_emoticons = count_emoticon(df.iloc[i]['text'])
    num_mentions = len(count_mentions(df.iloc[i]['text']))
    text1 = remove_emoticon(df.iloc[i]['text'])
    text2 = remove_handles_and_urls(text1)
    text3 = remove_hash(text2)
    num_hash = len(hashtags)
    rows = (i, text3, num_emoticons, num_hash, num_mentions, df.iloc[i]['gender'])
    data.append(rows)

In [92]:
transformed_df = pd.DataFrame(data,  columns=['comment_index', 'comment', "num_emoji", "num_hashtags", "num_mentions", 'gender'])
transformed_df.tail(5)

Unnamed: 0,comment_index,comment,num_emoji,num_hashtags,num_mentions,gender
10024,10024,Especially when all you do is the best you can...,0,0,0,1
10025,10025,Need A Ride Home From Practice And its Raining...,0,0,0,1
10026,10026,Fine and I'll drink tea too I love you,0,0,1,1
10027,10027,i had noticed your tendency to pee on the carp...,1,0,1,1
10028,10028,I think for my APUSH creative project I'm goin...,0,0,0,1


In [93]:
transformed_df.drop('comment_index', inplace = True, axis =1)
transformed_df.head()

Unnamed: 0,comment,num_emoji,num_hashtags,num_mentions,gender
0,Robbie E Responds To Critics After Win Against...,0,1,0,0
1,It felt like they were my friends and I was li...,0,2,0,0
2,Hi Looking at the url do you use Don't typical...,0,0,3,0
3,Gala Bingo clubs bought for 241m The UK's larg...,0,0,0,0
4,Ditto I'm still learning the favourites and re...,1,0,1,0


In [94]:
import nltk
from nltk.corpus import stopwords

def review_to_words(text):
    lower = text.lower()
    words = lower.split()
    words_only = [w for w in words ]
    
    stopWords = set(stopwords.words('english'))
    
    meaningful_words = [w for w in words if not w in stopWords]
    filtered_words = [i for i in meaningful_words if not i.isdigit()]
    
    return " ".join(meaningful_words)



##  All Features

In [95]:
##tfidf_vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
words = TfidfVectorizer(ngram_range=(1,3), lowercase=True, 
                        analyzer='word', stop_words='english', tokenizer=tokens,
                        min_df=3,max_df=0.9, sublinear_tf=1, smooth_idf=1, use_idf=1,
                        strip_accents='unicode', max_features=5000, decode_error='replace')

In [96]:
X1 = words.fit_transform(transformed_df['comment'])

In [97]:
X2 = sparse.csr_matrix(np.array([transformed_df['num_emoji']]).reshape(-1,1))

In [98]:
X3 = sparse.csr_matrix(np.array([transformed_df['num_hashtags']]).reshape(-1,1))

In [99]:
X4 = sparse.csr_matrix(np.array([transformed_df['num_mentions']]).reshape(-1,1))

In [100]:
X_c = sparse.hstack([X1, X2, X3, X4])

In [101]:
y = transformed_df['gender']

## Splitting to test_train class

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=101)

In [103]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=1, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [104]:
result = clf.predict(X_test)

In [105]:
output = pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [106]:
from sklearn.metrics import classification_report
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))

             precision    recall  f1-score   support

       male       0.00      0.00      0.00      1172
     female       0.53      1.00      0.69      1336

avg / total       0.28      0.53      0.37      2508



In [107]:
roc_auc_score(result,y_test)

0.2662544874351815

In [108]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.5322966507177034

### Naive Bayes

In [109]:
NB_clf = MultinomialNB().fit(X_train, y_train)
result = NB_clf.predict(X_test)
output = pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [110]:
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))
roc_auc_score(result,y_test)

             precision    recall  f1-score   support

       male       0.60      0.50      0.55      1172
     female       0.62      0.71      0.66      1336

avg / total       0.61      0.61      0.61      2508



0.6089257789005172

In [111]:
accuracy_score(y_true, y_pred)

0.6108452950558214

## SVM Classifier

In [112]:
from sklearn import svm

In [113]:
clf = svm.SVC()
clf.fit(X_train, y_train)
result = clf.predict(X_test)
output = pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [114]:
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))
roc_auc_score(result,y_test)

             precision    recall  f1-score   support

       male       0.00      0.00      0.00      1172
     female       0.53      1.00      0.69      1336

avg / total       0.28      0.53      0.37      2508



0.2662544874351815

In [115]:
accuracy_score(y_true, y_pred)

0.5322966507177034

## Text Only

### Naive Bayes (Sklearn)

In [116]:
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=101)
NB_clf = MultinomialNB().fit(X_train, y_train)
result = NB_clf.predict(X_test)
output = pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [117]:
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))
roc_auc_score(result,y_test)

             precision    recall  f1-score   support

       male       0.60      0.46      0.52      1172
     female       0.61      0.73      0.66      1336

avg / total       0.60      0.60      0.60      2508



0.6032079573150649

In [118]:
accuracy_score(y_true, y_pred)

0.604066985645933

In [119]:
print("predited female: "+str(sum(result)), 
      "number of females "+str(sum(y_test)), 
      "number of observations in test class: "+str(len(y_test)), 
       sep="\n")

predited female: 1615
number of females 1336
number of observations in test class: 2508


### Logistic Regression

In [120]:
from sklearn import linear_model
LogR_clf = linear_model.LogisticRegression(C=1e5)
LogR_clf.fit(X_train, y_train)
result = LogR_clf.predict(X_test)
output = pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [121]:
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))
roc_auc_score(result,y_test)

             precision    recall  f1-score   support

       male       0.53      0.53      0.53      1172
     female       0.59      0.59      0.59      1336

avg / total       0.56      0.56      0.56      2508



0.5592204582318123

In [122]:
accuracy_score(y_true, y_pred)

0.5610047846889952

In [123]:
print("predited female: "+str(sum(result)), 
      "number of females "+str(sum(y_test)), 
      "number of observations in test class: "+str(len(y_test)), 
       sep="\n")

predited female: 1331
number of females 1336
number of observations in test class: 2508


### Random Forest

In [124]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
output=pd.DataFrame( data={"actual_y":y_test, "predicted_y":result} )

In [125]:
y_true = y_test
y_pred = result
target_names = ['male', 'female']
print(classification_report(y_true, y_pred, target_names=target_names))

             precision    recall  f1-score   support

       male       0.87      0.02      0.04      1172
     female       0.54      1.00      0.70      1336

avg / total       0.69      0.54      0.39      2508



In [126]:
roc_auc_score(result,y_test)

0.7020984665052461

In [127]:
accuracy_score(y_true, y_pred)

0.5414673046251993

In [128]:
print("predicted female: "+str(sum(result)), 
      "number of females "+str(sum(y_test)), 
      "number of observations in test class: "+str(len(y_test)), 
       sep="\n")

predicted female: 2478
number of females 1336
number of observations in test class: 2508


### Naive Bayes

In [129]:
to_drop = ['num_emoji', 'num_hashtags', 'num_mentions']
transformed_df.drop(to_drop, axis = 1, inplace = True)
transformed_df.head()

Unnamed: 0,comment,gender
0,Robbie E Responds To Critics After Win Against...,0
1,It felt like they were my friends and I was li...,0
2,Hi Looking at the url do you use Don't typical...,0
3,Gala Bingo clubs bought for 241m The UK's larg...,0
4,Ditto I'm still learning the favourites and re...,0


In [130]:
all_words = []
for i in transformed_df['comment']:
    for j in i.split():
        all_words.append(''.join(j.lower()))
    
word_freq = nltk.FreqDist(all_words)
word_features = list(word_freq.keys())

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [131]:
#turning df into tuples
data = []
for i in range(len(transformed_df)):
    words = transformed_df.iloc[i]['comment'].split()
    data.append((words, transformed_df.iloc[i]['gender']))

In [132]:
featuresets = [(find_features(comment), gender) for (comment, gender) in data]

In [133]:
len(featuresets)

10029

### Train Test Split

In [134]:
X_train, X_test, y_train, y_test = train_test_split(transformed_df['comment'], transformed_df['gender'])

In [135]:
print(len(X_train), len(y_train))

7521 7521


In [136]:
train_df = pd.DataFrame({'X': X_train, 'y': y_train})

train = []
for i in range(len(train_df)):
    words = train_df.iloc[i]['X'].split()
    train.append((words, train_df.iloc[i]['y']))


In [137]:
test_df = pd.DataFrame({'X': X_test, 'y': y_test})

test = []
for i in range(len(test_df)):
    words = test_df.iloc[i]['X'].split()
    test.append((words, test_df.iloc[i]['y']))


In [138]:
train_features = [(find_features(comment), gender) for (comment, gender) in train]
test_features = [(find_features(comment), gender) for (comment, gender) in test]

### Training Naive Bayes

In [139]:
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [140]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test_features))*100)

Classifier accuracy percent: 60.16746411483253


In [141]:
classifier.show_most_informative_features(15)

Most Informative Features
                     bro = True                0 : 1      =      9.7 : 1.0
                    ball = True                0 : 1      =      8.9 : 1.0
                  sister = True                1 : 0      =      8.3 : 1.0
                   below = True                0 : 1      =      8.2 : 1.0
                 outside = True                1 : 0      =      7.7 : 1.0
                    shut = True                0 : 1      =      7.4 : 1.0
                      vs = True                0 : 1      =      7.4 : 1.0
                 defense = True                0 : 1      =      7.4 : 1.0
                  crying = True                1 : 0      =      7.1 : 1.0
                  motion = True                0 : 1      =      6.6 : 1.0
                 podcast = True                0 : 1      =      6.6 : 1.0
                princess = True                1 : 0      =      6.6 : 1.0
                    team = True                0 : 1      =      6.4 : 1.0

### Logistic Regression

In [142]:
from nltk.classify.scikitlearn import SklearnClassifier
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_features)
accuracy = nltk.classify.accuracy(LogisticRegression_classifier, test_features)*100
print("Logistic Regression classifier accuracy =", accuracy)

Logistic Regression classifier accuracy = 60.007974481658685
