In [1]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import string
import re

# Get Packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

#DOWNLOADED
#nltk.download('wordnet')
#nltk.download('stopwords')

#NOT DOWNLOADED
# nltk.download('punkt')
# nltk.download('corpus')


# URLs
url_tweets_keyword = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/keyword-tweets.txt'
url_tweets_general = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/general-tweets.txt'

# Reading data
df_tweets_keyword = pd.read_csv(url_tweets_keyword, sep='\t')
df_tweets_general = pd.read_csv(url_tweets_general, sep='\t')

In [2]:
display(df_tweets_keyword.shape)
display(df_tweets_general.shape)

(2003, 2)

(1999, 2)

In [3]:
df_tweets_keyword['POLIT'].value_counts()

POLIT
POLIT    1690
NOT       313
Name: count, dtype: int64

In [4]:
df_tweets_keyword.head()

Unnamed: 0,POLIT,"Global Voices Online Â» Alex Castro: A liberal, libertarian and libertine Brazilian blogger http://ff.im/-6izrC"
0,POLIT,Do the Conservatives Have a Death Wish? http:/...
1,NOT,@MMFlint I've seen all of your movies and Capi...
2,POLIT,RT @AllianceAlert: * House Dems ask for civili...
3,POLIT,RT @AdamSmithInst Quote of the week: My politi...
4,NOT,"@DeeptiLamba LOL, I like quotes. Feminist, ant..."


In [5]:
df_tweets_general['NOT'].value_counts()

NOT
NOT      1971
POLIT      28
Name: count, dtype: int64

In [6]:
df_tweets_general.head()

Unnamed: 0,NOT,Bumping dj sefs mixtape nowww this is my music new skooooool
0,NOT,#ieroween THE STORY OF IEROWEEN! THE VIDEO ->>...
1,NOT,trick or treating at the mall today; ZOO! last...
2,NOT,@Ussk81 PMSL!!! I try not to stare but I can't...
3,NOT,@Sc0rpi0n676 btw - is there a remote chance i ...
4,NOT,So... was that my invite to whoop ur ass? Soun...


In [7]:
# Concatenating "df_tweets_keyword" and "df_w=tweets_general" into One DataFrame & Renaming Columns to "Sentiment" & "Tweets"
LabeledTweets = pd.concat([
    df_tweets_keyword.rename(columns={df_tweets_keyword.columns[0]: "Sentiment", df_tweets_keyword.columns[1]: "Tweets"}),
    df_tweets_general.rename(columns={df_tweets_general.columns[0]: "Sentiment", df_tweets_general.columns[1]: "Tweets"})
], ignore_index=True)

display(LabeledTweets.shape)

# Displaying the 1st Few Rows of the Concatenated DataFrame
LabeledTweets.head()

(4002, 2)

Unnamed: 0,Sentiment,Tweets
0,POLIT,Do the Conservatives Have a Death Wish? http:/...
1,NOT,@MMFlint I've seen all of your movies and Capi...
2,POLIT,RT @AllianceAlert: * House Dems ask for civili...
3,POLIT,RT @AdamSmithInst Quote of the week: My politi...
4,NOT,"@DeeptiLamba LOL, I like quotes. Feminist, ant..."


In [8]:
# replace 'POLIT': 1, 'NOT': 0
LabeledTweets['Sentiment'] = LabeledTweets['Sentiment'].replace(
    {'POLIT': 1, 'NOT': 0}
)

LabeledTweets.head()

Unnamed: 0,Sentiment,Tweets
0,1,Do the Conservatives Have a Death Wish? http:/...
1,0,@MMFlint I've seen all of your movies and Capi...
2,1,RT @AllianceAlert: * House Dems ask for civili...
3,1,RT @AdamSmithInst Quote of the week: My politi...
4,0,"@DeeptiLamba LOL, I like quotes. Feminist, ant..."


In [9]:
# Cleaning Tweets Function
def preprocess(text, cleaning_steps):
    for step in cleaning_steps:
# Removing all Tokens that Contain a "@", Not Just the Character
        if step == 'remove_at_sign':
            text = ' '.join([word for word in text.split() if '@' not in word])
# Removing all Tokens that Contain a "http", Not Just the Character
        elif step == 'remove_http':
            text = ' '.join([word for word in text.split() if 'http' not in word])
# Replace all Punctuation Marks w/ a Space
        elif step == 'replace_punctuation_marks':
            text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
# Replace all Numbers w/ a Space
        elif step == 'replace_numbers':
            text = re.sub(r'\d+', ' ', text)
# Replace all Non-Ascii Characters w/ a Space
        elif step == 'replace_non_ascii':
            text = re.sub(r'[^\x00-\x7F]+', ' ', text)
# Converting all Characters to Lowercase
        elif step == 'lowercase':
            text = text.lower()
# Striping Extra Whitespace
        elif step == 'strip_whitespace':
            text = ' '.join(text.split())
# Lemmatize Tokens
        elif step == 'lemmatize':
            lmtzr = WordNetLemmatizer()
            word_list = text.split(' ')
            stemmed_words = [lmtzr.lemmatize(word) for word in word_list]
            text = ' '.join(stemmed_words)
    return text

# Cleaning Steps
cleaning_steps = ['remove_at_sign', 
                  'remove_http', 
                  'replace_punctuation_marks',
                  'replace_numbers',
                  'replace_non_ascii',
                  'lowercase',
                  'strip_whitespace',
                  'lemmatize'
                 ]

# Appplying the Cleaning Function "preprocess" to Column "Tweets"
LabeledTweets['Clean_Tweets'] = LabeledTweets['Tweets'].map(lambda s: preprocess(s, cleaning_steps))

LabeledTweets.head(20)

Unnamed: 0,Sentiment,Tweets,Clean_Tweets
0,1,Do the Conservatives Have a Death Wish? http:/...,do the conservative have a death wish
1,0,@MMFlint I've seen all of your movies and Capi...,i ve seen all of your movie and capitalism is ...
2,1,RT @AllianceAlert: * House Dems ask for civili...,rt house dems ask for civility at town hall an...
3,1,RT @AdamSmithInst Quote of the week: My politi...,rt quote of the week my political opinion lean...
4,0,"@DeeptiLamba LOL, I like quotes. Feminist, ant...",lol i like quote feminist anti men quote
5,1,@mystic23 I also think that most liberals don'...,i also think that most liberal don t spend a l...
6,1,@Karoli check @RepJackKimble explains brownshi...,check explains brownshirt conservative action ...
7,1,Finally US asks for extradition of Polanski - ...,finally u asks for extradition of polanski wha...
8,1,RT @Shoq: FUNNY! Teabagger @sanuzis quoting (b...,rt funny teabagger quoting but not understandi...
9,1,"progressives continue to fight for both ""Medic...",progressive continue to fight for both medicar...


In [10]:
# Declare the TFIDF Vectorizer w/ "max_features = 50"
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features=50, stop_words='english')

In [11]:
# Using sklearn LogisticRegression to Train a Model
# 75% Training - 25% Testing
X_train, X_test, y_train, y_test = train_test_split(
    LabeledTweets['Clean_Tweets'], LabeledTweets['Sentiment'], test_size = 0.25, random_state = 42
)

print("Training data type and shape:", type(X_train), X_train.shape)

# Train the Logistic Classifer on the Data
model = make_pipeline(vectorizer, LogisticRegression())
model.fit(X_train, y_train)

# Dtermining the Accuracy of the Model ~ Training Data & Testing Data
# Evaluate the Model
train_results = model.predict(X_train)
test_results = model.predict(X_test)
train_acc = np.mean(y_train == train_results)
test_acc = np.mean(y_test == test_results)

# Determining the Baseline Accuracy
baseline_accuracy = np.max([np.mean(y_test == 1), np.mean(y_test == 0)])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Variance: {}'.format(train_acc - test_acc))
print('Baseline accuracy: {}'.format(baseline_accuracy))

Training data type and shape: <class 'pandas.core.series.Series'> (3001,)
Train accuracy: 0.8790403198933688
Test accuracy: 0.8641358641358642
Variance: 0.014904455757504653
Baseline accuracy: 0.5494505494505495


In [12]:
# Repeating Steps 5, 6, and 7 with TfidfVectorizer max_features Set to 5, 500, 5000, 50000

# Setting the TfidfVectorizer max_features Set
max_features_x = [5, 500, 5000, 50000]

for i in max_features_x:
    print(f"\n***** MAX_FEATURES={i} *****")
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features=i, stop_words='english')
    model = make_pipeline(vectorizer, LogisticRegression())
    model.fit(X_train, y_train)
    train_results = model.predict(X_train)
    test_results = model.predict(X_test)
    train_acc = np.mean(y_train == train_results)
    test_acc = np.mean(y_test == test_results)
    baseline_accuracy = np.max([np.mean(y_test == 1), np.mean(y_test == 0)])
    print('Train accuracy: {}'.format(train_acc))
    print('Test accuracy: {}'.format(test_acc))
    print('Variance: {}'.format(train_acc - test_acc))
    print('Baseline accuracy: {}'.format(baseline_accuracy))
    


***** MAX_FEATURES=5 *****
Train accuracy: 0.7464178607130957
Test accuracy: 0.7232767232767233
Variance: 0.02314113743637236
Baseline accuracy: 0.5494505494505495

***** MAX_FEATURES=500 *****
Train accuracy: 0.9183605464845052
Test accuracy: 0.8881118881118881
Variance: 0.030248658372617032
Baseline accuracy: 0.5494505494505495

***** MAX_FEATURES=5000 *****
Train accuracy: 0.9536821059646784
Test accuracy: 0.8871128871128872
Variance: 0.06656921885179123
Baseline accuracy: 0.5494505494505495

***** MAX_FEATURES=50000 *****
Train accuracy: 0.9563478840386538
Test accuracy: 0.8871128871128872
Variance: 0.06923499692576662
Baseline accuracy: 0.5494505494505495


**SUMMARY**
<br>Both the Train & Test Accuracies increase when the "max_features" increase showing the model is learning more from the features. The Vairance between the Train & Test Accuracies are small showing no overfitting is happening. The Baseline Accuracy is staying around 55% showing my model is performing significantly better than a naive baseline.