In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer#, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import sys
LIBPATH = r'D:\Springboard_DataSci\Assignments\Lib'
if LIBPATH not in sys.path:
    sys.path.insert(0, LIBPATH)
import TimeTracker

TYPE = 'Type'
LOGISTIC = 'logistic'
NAIVE_BAYES = 'multinomial naive Bayes'
RAND_FOREST = 'random forest'

# Get all the tweets.
path = r'D:\Springboard_DataSci\Twitter_MBTI_predictor\Data Output'
os.chdir(path)

In [2]:
letters = [['E', 'I'], ['S', 'N'], ['F', 'T'], ['J', 'P']]
MB_types = []
# Get the list of types using binary math.
for i in range(16):
    MB_types.append(letters[0][i//8%2] + letters[1][i//4%2]
                      + letters[2][i//2%2] + letters[3][i%2])

In [3]:
def load_tweets(MB_type):
    return pd.read_csv(
        path + '\\' + MB_type + '_tweets.csv', parse_dates=[2],
        infer_datetime_format=True)

In [4]:
# Load tweets
print('Loading tweets:', end=' ')
for i, MB_type in enumerate(MB_types):
    print(f'{MB_type}', end=' ')
    if i == 0:
        tweets = load_tweets(MB_type)
    else:
        tweets = tweets.append(load_tweets(MB_type))

Loading tweets: ESFJ ESFP ESTJ ESTP ENFJ ENFP ENTJ ENTP ISFJ ISFP ISTJ ISTP INFJ INFP INTJ INTP 

In [5]:
# Classify their type
for i, letter in enumerate('ESFJ'):
    tweets[letter] = tweets['MBTI'].str[i] == letter

We're going to pick up a lot of junk if we don't trim out tags and hashtags.
Let's do that now.

In [6]:
def trim_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                           " ", tweet).split())

In [7]:
print('\nTrimming tweets of tags and URLs')
tweets['Tweet'] = tweets['Tweet'].apply(trim_tweet)


Trimming tweets of tags and URLs


Now let's attempt to classify.

In [8]:
def analyze_tweets(tweets, letter, classifier, min_df=200, max_df=1.,
                   alpha=1, C=1, max_depth=None, n_estimators=100,
                   stop_words=None, get_words_and_probas=False, test_size=0.25,
                   max_iter=1e3):
    '''Text classification of the tweets'''
    y = tweets[letter]
    vectorizer = CountVectorizer(min_df=min_df, max_df=max_df,
                                 stop_words=stop_words)
    tweets = tweets['Tweet'].to_list()
    # Get the sparse matrix (x, y) of (tweetID, wordID).
    X = vectorizer.fit_transform(tweets)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0)
    
    if classifier==LOGISTIC:
        clf = LogisticRegression(C=C, max_iter=max_iter, random_state=0)\
            .fit(X_train, y_train)
    elif classifier==NAIVE_BAYES:
        clf = MultinomialNB(alpha=alpha).fit(X_train, y_train)
    elif classifier==RAND_FOREST:
        clf = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=0)\
            .fit(X_train, y_train)
    else:
        raise ValueError('Unrecognized classifier "' + classifier + '"')
        
    if get_words_and_probas:
        x = np.eye(X_test.shape[1])
        words_all = np.array(vectorizer.get_feature_names())
        probs = clf.predict_log_proba(x)[:, 0]
    else:
        words_all = probs = None
    return clf.score(X_train, y_train), clf.score(X_test, y_test), words_all,\
        probs

In [9]:
print('Grouping tweets by author')
tweets_per_author = tweets.copy()
tweets_per_author['Tweet'] = tweets_per_author['Tweet']\
    .apply(lambda x: x + ' ')
tweets_per_author = tweets_per_author.groupby(
    tweets_per_author['Screen name'])['Tweet'].apply(lambda x: x.sum())\
    .reset_index()
# This threw away the MBTI info, but we can get it back.
authors_MBTI = tweets[['Screen name', 'E', 'S', 'F', 'J']].drop_duplicates()
tweets_per_author = tweets_per_author.merge(
    authors_MBTI, 'left', on='Screen name')

Grouping tweets by author


We've looked through several combinations of hyperparameters. Let's look
for the one that performs the best on each axis. We search by axis (letter),
min_df, classifier, and hyperparameter. We look for the best test score per
run and record it. This testing includes random forests, which we have not
looked at yet.

In [10]:
stopwatch = TimeTracker.TimeTracker() # Don't include setup.
test_size = 0.4
min_dfs = [50, 100, 200, 300, 500]
for letter_pair in letters:
    test_letter = letter_pair[0]
    print('Analyzing tweets at the author level:', end=' ')
    print(f'{test_letter}/{letter_pair[1]}')
    
    best_min_df, best_alpha, best_test_score = 0, 0, 0
    for min_df in min_dfs:
        for alpha in np.power(10., np.arange(-2, 3)):
            author_results = analyze_tweets(
                tweets_per_author, letter=test_letter, classifier=NAIVE_BAYES,
                min_df=min_df, alpha=alpha, test_size=test_size)
            train_score, test_score = round(author_results[0], 4),\
                round(author_results[1], 4)
            if test_score > best_test_score:
                best_min_df, best_alpha, best_test_score\
                    = min_df, alpha, test_score
    print('\tBest min_df, alpha, and score for naive Bayes:',
          best_min_df, best_alpha, best_test_score)

    best_min_df, best_max_depth, best_test_score = 0, 0, 0
    for min_df in min_dfs:
        for max_depth in [2, 3, 4, 5]:
            author_results = analyze_tweets(
                tweets_per_author, letter=test_letter, classifier=RAND_FOREST,
                min_df=min_df, max_depth=max_depth, test_size=test_size)
            train_score, test_score = round(author_results[0], 4),\
                round(author_results[1], 4)
            if test_score > best_test_score:
                best_min_df, best_max_depth, best_test_score\
                    = min_df, max_depth, test_score
    print('\tBest min_df, max_depth, and score for random forests:',
          best_min_df, best_max_depth, best_test_score)

    best_min_df, best_C, best_test_score = 0, 0, 0
    for min_df in min_dfs:
        for C in np.power(10., np.arange(-2, 3)):
            author_results = analyze_tweets(
                tweets_per_author, letter=test_letter, classifier=LOGISTIC,
                min_df=min_df, C=C, test_size=test_size, max_iter=1e4)
            train_score, test_score = round(author_results[0], 4),\
                round(author_results[1], 4)
            if test_score > best_test_score:
                best_min_df, best_C, best_test_score\
                    = min_df, C, test_score
    print('\tBest min_df, C, and score for logistic regression:',
          best_min_df, best_C, best_test_score)

print('\aElapsed grid searching time: ' + stopwatch.getElapsedTime())

Analyzing tweets at the author level: E/I
	Best min_df, alpha, and score for naive Bayes: 200 0.01 0.5891
	Best min_df, max_depth, and score for random forests: 300 4 0.5859
	Best min_df, C, and score for logistic regression: 100 0.01 0.5766
Analyzing tweets at the author level: S/N
	Best min_df, alpha, and score for naive Bayes: 200 0.01 0.6594
	Best min_df, max_depth, and score for random forests: 50 3 0.6234
	Best min_df, C, and score for logistic regression: 50 0.01 0.5766
Analyzing tweets at the author level: F/T
	Best min_df, alpha, and score for naive Bayes: 500 100.0 0.5969
	Best min_df, max_depth, and score for random forests: 200 3 0.6031
	Best min_df, C, and score for logistic regression: 200 0.01 0.5812
Analyzing tweets at the author level: J/P
	Best min_df, alpha, and score for naive Bayes: 50 1.0 0.6
	Best min_df, max_depth, and score for random forests: 200 2 0.6
	Best min_df, C, and score for logistic regression: 200 0.01 0.5734
Elapsed grid searching time: --- 23.9 mi

In most cases, naive Bayes fares the best, and in the one case where random
forests edges it out, the difference is minor. Let's repeat the process, this
time over just the naive Bayes and see what our results are.

In [11]:
min_dfs = [50, 100, 200, 300, 500]
for letter_pair in letters:
    test_letter = letter_pair[0]
    print('\nAnalyzing tweets at the author level:', end=' ')
    print(f'{test_letter}/{letter_pair[1]}')
    
    best_min_df, best_alpha, best_test_score = 0, 0, 0
    for min_df in min_dfs:
        for alpha in np.power(10., np.arange(-2, 3)):
            author_results = analyze_tweets(
                tweets_per_author, letter=test_letter, classifier=NAIVE_BAYES,
                min_df=min_df, alpha=alpha, test_size=test_size)
            train_score, test_score = round(author_results[0], 4),\
                round(author_results[1], 4)
            if test_score > best_test_score:
                best_min_df, best_alpha, best_test_score\
                    = min_df, alpha, test_score
            print(f'\tmin_df={min_df}, alpha={alpha}, train score='
                  + f'{train_score}, test score={test_score}')
            
    print(f'Best min_df, alpha, and score for {test_letter}:',
          best_min_df, best_alpha, best_test_score)


Analyzing tweets at the author level: E/I
	min_df=50, alpha=0.01, train score=0.8188, test score=0.5734
	min_df=50, alpha=0.1, train score=0.8188, test score=0.5734
	min_df=50, alpha=1.0, train score=0.8177, test score=0.5734
	min_df=50, alpha=10.0, train score=0.8083, test score=0.575
	min_df=50, alpha=100.0, train score=0.7729, test score=0.5812
	min_df=100, alpha=0.01, train score=0.7729, test score=0.5812
	min_df=100, alpha=0.1, train score=0.7729, test score=0.5812
	min_df=100, alpha=1.0, train score=0.7719, test score=0.5812
	min_df=100, alpha=10.0, train score=0.7688, test score=0.5859
	min_df=100, alpha=100.0, train score=0.7479, test score=0.5781
	min_df=200, alpha=0.01, train score=0.7271, test score=0.5891
	min_df=200, alpha=0.1, train score=0.7271, test score=0.5891
	min_df=200, alpha=1.0, train score=0.726, test score=0.5891
	min_df=200, alpha=10.0, train score=0.7219, test score=0.5875
	min_df=200, alpha=100.0, train score=0.7031, test score=0.575
	min_df=300, alpha=0.01

Low values of min_df badly overfit the data. These models probably rely
on words that rarely show up and happen to particularly fit the given tweets
but cannot be generalized well. However, we would like to keep min_df as low
as possible to avoid throwing out too much. Furthermore, since the test scores
do not vary much within each axis, we can select one set of hyperparameters
for the whole test. That selection is min_df=200 and alpha=1.