In [50]:
import pandas as pd
import numpy as np 
import re
import os
import json
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import pylab as pl 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

## read data

In [5]:
data_path = 'data'

In [6]:
labels_train = pd.read_csv(os.path.join(data_path, 'labels_train.csv'), low_memory=False)
labels_test = pd.read_csv(os.path.join(data_path, 'labels_test.csv'), low_memory=False)
print(labels_train)
print(labels_test)

      person_id  gender    generation occupation
0             0    male   Millennials     sports
1             1    male   Millennials     sports
2             2  female  Generation X  performer
3             3    male   Millennials     sports
4             4    male  Generation X     sports
...         ...     ...           ...        ...
4668       4668    male   Millennials     sports
4669       4669    male   Millennials    creator
4670       4670    male       Boomers   politics
4671       4671  female       Boomers     sports
4672       4672    male  Generation X   politics

[4673 rows x 4 columns]
     person_id  gender    generation occupation
0         4673    male   Millennials     sports
1         4674    male  Generation X     sports
2         4675    male   Millennials    creator
3         4676    male  Generation X  performer
4         4677    male  Generation X    creator
..         ...     ...           ...        ...
515       5188    male   Millennials     sports
516

In [7]:
tweets_train = pd.read_csv(os.path.join(data_path, 'tweets_train.csv'), low_memory=False)
tweets_test = pd.read_csv(os.path.join(data_path, 'tweets_test.csv'), low_memory=False)

In [8]:
print(tweets_train)
print(tweets_test)

        person_id                                              tweet
0               0                     Legend https://t.co/cgkeYFI92H
1               0  @NHLFlames @bigern10 @calstampeders cannot wai...
2               0  RT @Fan960Wills: Help @BBBSCalgary by texting ...
3               0  @RMHSouthernAB @NHLFlames thank you Derek. Ver...
4               0  RT @RMHSouthernAB: Inspired by his @NHLFlames ...
...           ...                                                ...
2351794      4672                                @MarioAsselin merci
2351795      4672  @MarioAsselin @ramezayoub est ce possible d'av...
2351796      4672  @Casup007 Allo Carole as tu un parapluie en pl...
2351797      4672  @MarioAsselin @ramezayoub: Bonjour M Asselin d...
2351798      4672  Bon lundi à tous je commence par un tweet ce m...

[2351799 rows x 2 columns]
        person_id                                              tweet
0            4673  So proud #RollTide #VictoryWednesday #ThankYou...
1     

In [9]:
tweets_train

Unnamed: 0,person_id,tweet
0,0,Legend https://t.co/cgkeYFI92H
1,0,@NHLFlames @bigern10 @calstampeders cannot wai...
2,0,RT @Fan960Wills: Help @BBBSCalgary by texting ...
3,0,@RMHSouthernAB @NHLFlames thank you Derek. Ver...
4,0,RT @RMHSouthernAB: Inspired by his @NHLFlames ...
...,...,...
2351794,4672,@MarioAsselin merci
2351795,4672,@MarioAsselin @ramezayoub est ce possible d'av...
2351796,4672,@Casup007 Allo Carole as tu un parapluie en pl...
2351797,4672,@MarioAsselin @ramezayoub: Bonjour M Asselin d...


In [10]:
len(tweets_train)

2351799

In [11]:
tweets_train.isnull().sum()

person_id    0
tweet        2
dtype: int64

In [12]:
tweets_train.dropna(inplace=True)

In [13]:
len(tweets_train)

2351797

In [14]:
# Säubern der Tweets
stemmer = SnowballStemmer('english')
stopwords_E = stopwords.words('english')

def cleaning_tweet(tweet):
    tweet = tweet.lower()
    #tweet = re.sub('@[^\s]+', '', tweet) # entfernt usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # entfernt #hashtag
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet) # entfernt URL
    tweet_token = word_tokenize(tweet)
    stemmedword = [stemmer.stem(w) for w in tweet_token] 
    no_stopword = [w for w in stemmedword if w.isalpha() and w not in stopwords_E ]   
    words = [w for w in no_stopword if len(w) > 1]
    return words

In [15]:
# laden und säubern X_train
grouped_persons = tweets_train.groupby('person_id')
grouped_tweets = grouped_persons['tweet'].agg(lambda column: " ".join(column))
grouped_tweets

person_id
0       Legend https://t.co/cgkeYFI92H @NHLFlames @big...
1       RT @MisterRoast98: It's amazing what a simple ...
10      RT @ARNelson7: Had a great time time in Richmo...
100     Very sad to hear of Paul Allen’s passing. His ...
1000    Thanks for leaving the shanks out 😂 https://t....
                              ...                        
995     Kæru konur, leggjum niður launuð og ólaunuð st...
996     RT @mendy_bernard: Une pensée pour toutes les ...
997     Fight night 4 years ago. Good stuff. I miss it...
998     RT @PWEverestRugby: Here is the link to my #Ev...
999     Who else is seeing @thegreatwall this weekend?...
Name: tweet, Length: 4673, dtype: object

In [16]:
cleaned_tweets_data = grouped_tweets.apply(lambda row : cleaning_tweet(row)) # ca. 10 minuten wartezeit

In [18]:
cleaned_tweets_data

person_id
0       [legend, nhlflame, calstamped, wait, see, foot...
1       [rt, amaz, simpl, group, chat, veri, import, y...
10      [rt, great, time, time, richmond, weekend, rea...
100     [veri, sad, hear, paul, allen, pass, passion, ...
1000    [thank, leav, shank, hbd, man, gis, kiss, carl...
                              ...                        
995     [kæru, konur, leggjum, niður, launuð, og, ólau...
996     [rt, une, pensé, pour, tout, les, personn, qui...
997     [fight, night, year, ago, good, stuff, miss, r...
998     [rt, pweverestrugbi, link, everestrugbi, justg...
999     [els, see, thegreatwal, weekend, excit, watch,...
Name: tweet, Length: 4673, dtype: object

In [19]:
cleaned_tweets = cleaned_tweets_data.reset_index()
cleaned_tweets = cleaned_tweets.drop(columns=['person_id']) 
#cleaned_tweets = cleaned_tweets.join(labels_train['person_id']) # neu frisch hinzufügen person_id
#cleaned_tweets = cleaned_tweets[['person_id','tweet']] # neu column anordnen

In [20]:
X_train = cleaned_tweets
X_train = cleaned_tweets['tweet']
X_train = X_train.apply(lambda row : ' '.join(row)) 

print(X_train)

0       legend nhlflame calstamped wait see footag woo...
1       rt amaz simpl group chat veri import youth hea...
2       rt great time time richmond weekend readingfig...
3       veri sad hear paul allen pass passion invent p...
4       thank leav shank hbd man gis kiss carlo ricki ...
                              ...                        
4668    kæru konur leggjum niður launuð og ólaunuð stö...
4669    rt une pensé pour tout les personn qui nous on...
4670    fight night year ago good stuff miss rt ufc go...
4671    rt pweverestrugbi link everestrugbi justgiv pa...
4672    els see thegreatwal weekend excit watch matt d...
Name: tweet, Length: 4673, dtype: object


In [21]:
# laden und säubern X_test
grouped_persons_test = tweets_test.groupby('person_id')
grouped_tweets_test  = grouped_persons_test['tweet'].agg(lambda column: " ".join(column))
grouped_tweets_test

person_id
4673    So proud #RollTide #VictoryWednesday #ThankYou...
4674    RT @LSUFBrecruiting: #NFLSU: Week 7 Top Perfor...
4675    Remember that the one time Trump owned a profe...
4676    :) :) https://t.co/XTmHQagQLI Doston miliye me...
4677    RT @shinnox: Every #mk champions breakfast of ...
                              ...                        
5188    Any 2nd Rows interested in coming to Hong Kong...
5189    @nojokemoke Yes Sr!!!! Let’s get it @bschoon86...
5190    @HildaEsq @EMorrisonSmith @RiseSanDiego @Karee...
5191    ‘Divorce’ Renewed for Season 3 at HBO With New...
5192    RT @DurstJarmarquis: It’s truly a blessing to ...
Name: tweet, Length: 520, dtype: object

In [22]:
grouped_tweets_test_data =  grouped_tweets_test.apply(lambda row : cleaning_tweet(row)) # ca. 2 minuten wartezeit

In [23]:
grouped_tweets_test_data.head()

person_id
4673    [proud, rolltid, victorywednesday, thankyouala...
4674    [rt, lsufbrecruit, nflsu, week, top, perform, ...
4675    [rememb, one, time, trump, profession, sport, ...
4676    [doston, miliy, mere, nay, jodidaar, se, br, h...
4677    [rt, shinnox, everi, mk, champion, breakfast, ...
Name: tweet, dtype: object

In [24]:
grouped_tweets_test = grouped_tweets_test_data.reset_index()
grouped_tweets_test = grouped_tweets_test.drop(columns=['person_id']) 
grouped_tweets_test

Unnamed: 0,tweet
0,"[proud, rolltid, victorywednesday, thankyouala..."
1,"[rt, lsufbrecruit, nflsu, week, top, perform, ..."
2,"[rememb, one, time, trump, profession, sport, ..."
3,"[doston, miliy, mere, nay, jodidaar, se, br, h..."
4,"[rt, shinnox, everi, mk, champion, breakfast, ..."
...,...
515,"[ani, row, interest, come, hong, kong, play, m..."
516,"[nojokemok, yes, sr, let, get, great, wish, co..."
517,"[hildaesq, emorrisonsmith, risesandiego, wish,..."
518,"[divorc, renew, season, hbo, new, showrunn, va..."


In [25]:
X_test = grouped_tweets_test['tweet'] #cleaned_tweets_username / cleaned_tweets
X_test = X_test.apply(lambda row : ' '.join(row)) 
X_test

0      proud rolltid victorywednesday thankyoualabama...
1      rt lsufbrecruit nflsu week top perform rt lsuf...
2      rememb one time trump profession sport team in...
3      doston miliy mere nay jodidaar se br humn kasa...
4      rt shinnox everi mk champion breakfast choic n...
                             ...                        
515    ani row interest come hong kong play month con...
516    nojokemok yes sr let get great wish could got ...
517    hildaesq emorrisonsmith risesandiego wish coul...
518    divorc renew season hbo new showrunn varieti a...
519    rt durstjarmarqui truli bless onli part texasf...
Name: tweet, Length: 520, dtype: object

## train classifierers and predictions

In [78]:
def show_res(y_test, predictions):
    print("Confusin Matrix:")
    print(metrics.confusion_matrix(y_test,predictions))
    print("Classification Report:")
    print(metrics.classification_report(y_test,predictions))
    print("Accuracy Score:")
    print(metrics.accuracy_score(y_test,predictions))

def train_data_naive_bayes(categorie):
    print("naive_bayes")
    
    y_train = labels_train[categorie]
    y_test = labels_test[categorie]
    
    text_clf_nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
    
    text_clf_nb.fit(X_train, y_train) 
    
    predictions = text_clf_nb.predict(X_test)
   
    show_res(y_test, predictions)


def train_data_linear_svc(categorie):
    print("linear_svc")
    
    y_train = labels_train[categorie]
    y_test = labels_test[categorie]
    
    text_clf_lsvc = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LinearSVC()),
              ])
    
    text_clf_lsvc.fit(X_train, y_train)
    
    predictions = text_clf_lsvc.predict(X_test)
    show_res(y_test, predictions)


def train_data_SGDClassifier(categorie):
    print("SGDClassifier")
        
    y_train = labels_train[categorie]
    y_test = labels_test[categorie]
    
    text_clf_lsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()), 
               ])
    # loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None
    
    text_clf_lsvc.fit(X_train, y_train)

    predictions = text_clf_lsvc.predict(X_test)
    show_res(y_test, predictions) 

def train_data_LogistigRegression(categorie):
    print("LogistigRegression")
    
    y_train = labels_train[categorie]
    y_test = labels_test[categorie]
    
    text_clf_lsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)),
               ])
    
    text_clf_lsvc.fit(X_train, y_train)

    predictions = text_clf_lsvc.predict(X_test)
    show_res(y_test, predictions) 

def train_data_DecisionTreeClassifier(categorie):
    print("DecisionTreeClassifier")

    y_train = labels_train[categorie]
    y_test = labels_test[categorie]
    
    clf_generation = DecisionTreeClassifier()
    clf_generation.fit(X_train, y_train)

    predictions = text_clf_lsvc.predict(X_test)
    show_res(y_test, predictions)


In [79]:
train_data_naive_bayes('gender')


naive_bayes
Confusin Matrix:
[[  0 100]
 [  0 420]]
Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       100
        male       0.81      1.00      0.89       420

    accuracy                           0.81       520
   macro avg       0.40      0.50      0.45       520
weighted avg       0.65      0.81      0.72       520

Accuracy Score:
0.8076923076923077


  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
train_data_SGDClassifier('gender')

SGDClassifier
Confusin Matrix:
[[  6  94]
 [  8 412]]
Classification Report:
              precision    recall  f1-score   support

      female       0.43      0.06      0.11       100
        male       0.81      0.98      0.89       420

    accuracy                           0.80       520
   macro avg       0.62      0.52      0.50       520
weighted avg       0.74      0.80      0.74       520

Accuracy Score:
0.8038461538461539


In [81]:
train_data_linear_svc('gender')


linear_svc
Confusin Matrix:
[[  6  94]
 [  4 416]]
Classification Report:
              precision    recall  f1-score   support

      female       0.60      0.06      0.11       100
        male       0.82      0.99      0.89       420

    accuracy                           0.81       520
   macro avg       0.71      0.53      0.50       520
weighted avg       0.77      0.81      0.74       520

Accuracy Score:
0.8115384615384615


In [82]:
train_data_LogistigRegression('gender')

LogistigRegression
[LibLinear]Confusin Matrix:
[[  4  96]
 [  1 419]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.04      0.08       100
        male       0.81      1.00      0.90       420

    accuracy                           0.81       520
   macro avg       0.81      0.52      0.49       520
weighted avg       0.81      0.81      0.74       520

Accuracy Score:
0.8134615384615385


In [83]:
train_data_naive_bayes('generation')


naive_bayes
Confusin Matrix:
[[  0   0   0  78   0]
 [  0   0   0 108   0]
 [  0   0   0  31   0]
 [  0   0   0 290   0]
 [  0   0   0  13   0]]
Classification Report:
              precision    recall  f1-score   support

     Boomers       0.00      0.00      0.00        78
Generation X       0.00      0.00      0.00       108
Generation Z       0.00      0.00      0.00        31
 Millennials       0.56      1.00      0.72       290
      Silent       0.00      0.00      0.00        13

    accuracy                           0.56       520
   macro avg       0.11      0.20      0.14       520
weighted avg       0.31      0.56      0.40       520

Accuracy Score:
0.5576923076923077


  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
train_data_SGDClassifier('generation')


SGDClassifier
Confusin Matrix:
[[  4   2   1  71   0]
 [  6   5   1  96   0]
 [  0   2   0  29   0]
 [ 11  21   3 255   0]
 [  1   2   0  10   0]]
Classification Report:
              precision    recall  f1-score   support

     Boomers       0.18      0.05      0.08        78
Generation X       0.16      0.05      0.07       108
Generation Z       0.00      0.00      0.00        31
 Millennials       0.55      0.88      0.68       290
      Silent       0.00      0.00      0.00        13

    accuracy                           0.51       520
   macro avg       0.18      0.20      0.17       520
weighted avg       0.37      0.51      0.41       520

Accuracy Score:
0.5076923076923077


In [85]:
train_data_linear_svc('generation')


linear_svc
Confusin Matrix:
[[  3   1   0  74   0]
 [  4   3   1 100   0]
 [  0   2   0  29   0]
 [  7  15   1 267   0]
 [  1   1   0  11   0]]
Classification Report:
              precision    recall  f1-score   support

     Boomers       0.20      0.04      0.06        78
Generation X       0.14      0.03      0.05       108
Generation Z       0.00      0.00      0.00        31
 Millennials       0.56      0.92      0.69       290
      Silent       0.00      0.00      0.00        13

    accuracy                           0.53       520
   macro avg       0.18      0.20      0.16       520
weighted avg       0.37      0.53      0.41       520

Accuracy Score:
0.525


In [86]:
train_data_LogistigRegression('generation')


LogistigRegression
[LibLinear]Confusin Matrix:
[[  2   1   0  75   0]
 [  1   0   0 107   0]
 [  0   0   0  31   0]
 [  2   7   0 281   0]
 [  0   0   0  13   0]]
Classification Report:
              precision    recall  f1-score   support

     Boomers       0.40      0.03      0.05        78
Generation X       0.00      0.00      0.00       108
Generation Z       0.00      0.00      0.00        31
 Millennials       0.55      0.97      0.71       290
      Silent       0.00      0.00      0.00        13

    accuracy                           0.54       520
   macro avg       0.19      0.20      0.15       520
weighted avg       0.37      0.54      0.40       520

Accuracy Score:
0.5442307692307692


In [None]:
train_data_naive_bayes('occupation')


In [None]:
train_data_SGDClassifier('occupation')


In [None]:
train_data_linear_svc('occupation')


In [None]:
train_data_LogistigRegression('occupation')
