In [1]:
import pandas as pd

# Step 1: Exploratory Data Analysis

In [2]:
# load in datasets
training_df = pd.read_csv("/users/chris/pythonML/train.csv")
testing_df = pd.read_csv("/users/chris/pythonML/test.csv")

training_size = len(training_df) # get size of train dataset
print("The total number of rows in train.csv: " + str(training_size))

testing_size = len(testing_df) # get size of test dataset
print("The total number of rows in test.csv: " + str(testing_size))

The total number of rows in train.csv: 1048575
The total number of rows in test.csv: 359


In [3]:
# count the number of positive and negative data points for train.csv
num_positive = len(training_df[training_df['Sentiment'] == 1]) # 1 == positive
print("The total number of positive datapoints in train.csv: " + str(num_positive))
num_negative = len(training_df[training_df['Sentiment'] == 0]) # 0 == negative
print("The total number of negative datapoints in train.csv: " + str(num_negative))
# count the number of positive and negative data points for test.csv
num_positive = len(testing_df[testing_df['Sentiment'] == 1])
print("The total number of positive datapoints in test.csv: " + str(num_positive))
num_negative = len(testing_df[testing_df['Sentiment'] == 0])
print("The total number of negative datapoints in test.csv: " + str(num_negative))

The total number of positive datapoints in train.csv: 248575
The total number of negative datapoints in train.csv: 800000
The total number of positive datapoints in test.csv: 182
The total number of negative datapoints in test.csv: 177


In [4]:
# check if data sets have any empty values
print(training_df.isnull().values.any())
print(testing_df.isnull().values.any())

False
False


# Step 2: Text Preprocessing 

In [5]:
# convert all words to lower case in our datasets' discussion texts
training_df['Text'] = training_df['Text'].str.lower();
testing_df['Text'] = testing_df['Text'].str.lower();
# remove digital numbers and special characters from our datasets' discussion texts
training_df['Text'] = training_df['Text'].str.replace('[^a-zA-Z ]','', regex=True)
testing_df['Text'] = testing_df['Text'].str.replace('[^a-zA-Z ]','', regex=True)

# Step 3: Linguistic Feature Extraction

In [6]:
# since train.csv is too large to process, we need to get a random subset of the dataframe to perform classification models
train_subset = training_df.sample(n=15000, random_state=42) # get subset of 100,000 rows

In [7]:
# extract linguistic features using tf*idf
from sklearn.feature_extraction.text import TfidfVectorizer 

tf_idf_model = TfidfVectorizer() # create an instance of the TfidfVectorizer class
tf_idf_training = tf_idf_model.fit_transform(train_subset['Text']) # fit and transform train.csv text to generate matrix of tf-idf weights
print("Tf*idf feature extraction on training dataset:")
print(tf_idf_training)

# do the same for test.csv
tf_idf_testing = tf_idf_model.transform(testing_df['Text'])
print("Tf*idf feature extraction on testing dataset:")
print(tf_idf_testing)




Tf*idf feature extraction on training dataset:
  (0, 24763)	0.24895649564318997
  (0, 20962)	0.3706623500033639
  (0, 9182)	0.1910558669130168
  (0, 22357)	0.08168156703437544
  (0, 18268)	0.3296238032191935
  (0, 1794)	0.13509209827149246
  (0, 23853)	0.3706623500033639
  (0, 15543)	0.1419572734348018
  (0, 11193)	0.2578346952641455
  (0, 10503)	0.1190258185859985
  (0, 2771)	0.2659431170843537
  (0, 20918)	0.30074137606212475
  (0, 8755)	0.16640353710342626
  (0, 16371)	0.3238655207467009
  (0, 21893)	0.12891741213817687
  (0, 12161)	0.16632481232173474
  (0, 15502)	0.1312724293531593
  (0, 5535)	0.19261254611504044
  (1, 23923)	0.40753113086212206
  (1, 10571)	0.1250560779971536
  (1, 20688)	0.34575794208200933
  (1, 15672)	0.34575794208200933
  (1, 22703)	0.40753113086212206
  (1, 1798)	0.28449495222825516
  (1, 1629)	0.40753113086212206
  :	:
  (14998, 7909)	0.23106960082716574
  (14998, 708)	0.17002884234928728
  (14998, 23864)	0.21631171769287505
  (14998, 9584)	0.16738800056414

In [8]:
# extract linguistic features using bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer() # create an instance of CountVectorizer
bow_matrix_train = bow.fit_transform(train_subset['Text']) # fit the vectorizer to train.csv text and transform into a bow matrix
print("Bag-of-words feature extraction on training dataset:")
print(bow_matrix_train)

# do the same for test.csv
bow_matrix_test = bow.transform(testing_df['Text']) 
print("Bag-of-words feature extraction on testing dataset:")
print(bow_matrix_test)

Bag-of-words feature extraction on training dataset:
  (0, 5535)	1
  (0, 15502)	1
  (0, 12161)	1
  (0, 21893)	1
  (0, 16371)	1
  (0, 8755)	1
  (0, 20918)	1
  (0, 2771)	1
  (0, 10503)	1
  (0, 11193)	1
  (0, 15543)	1
  (0, 23853)	1
  (0, 1794)	1
  (0, 18268)	1
  (0, 22357)	1
  (0, 9182)	1
  (0, 20962)	1
  (0, 24763)	1
  (1, 19648)	1
  (1, 1629)	1
  (1, 1798)	1
  (1, 22703)	1
  (1, 15672)	1
  (1, 20688)	1
  (1, 10571)	1
  :	:
  (14998, 10920)	1
  (14998, 19392)	1
  (14998, 20982)	1
  (14998, 19378)	1
  (14998, 10998)	2
  (14998, 20773)	1
  (14998, 8984)	1
  (14998, 9584)	1
  (14998, 23864)	1
  (14998, 708)	1
  (14998, 7909)	1
  (14998, 827)	1
  (14998, 22241)	1
  (14998, 14431)	1
  (14998, 5347)	1
  (14998, 22661)	1
  (14999, 11152)	1
  (14999, 21914)	1
  (14999, 19801)	1
  (14999, 24357)	1
  (14999, 7303)	1
  (14999, 22486)	1
  (14999, 23928)	1
  (14999, 18950)	1
  (14999, 12636)	1
Bag-of-words feature extraction on testing dataset:
  (0, 2977)	1
  (0, 4415)	1
  (0, 7084)	1
  (0, 10571)	

# Step 4: Build your sentiment classification model

In [9]:
# imports for machine learning classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [10]:
# set y_train and y_test to sentiment portion of database
y_train = train_subset['Sentiment']
y_test = testing_df['Sentiment']

# create logistic regression model on bag-of-words feature extraction 
lc_bag = LogisticRegression(random_state=42)
lc_bag.fit(bow_matrix_train, y_train)

y_lc_bag_predicted = lc_bag.predict(bow_matrix_test) # predict test data

print(classification_report(y_test, y_lc_bag_predicted)) # print results of logistic regression model w/ bow

              precision    recall  f1-score   support

           0       0.65      0.94      0.77       177
           1       0.90      0.51      0.65       182

    accuracy                           0.72       359
   macro avg       0.78      0.72      0.71       359
weighted avg       0.78      0.72      0.71       359



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
# create logistic regression model on tf*idf feature extraction
lc_tf_idf = LogisticRegression(random_state=42)
lc_tf_idf.fit(tf_idf_training, y_train)

y_lc_tf_idf_predicted = lc_tf_idf.predict(tf_idf_testing) # predict test data

print(classification_report(y_test, y_lc_tf_idf_predicted)) # print results of logistic regression model w/ tf*idf

              precision    recall  f1-score   support

           0       0.58      0.97      0.73       177
           1       0.92      0.32      0.48       182

    accuracy                           0.64       359
   macro avg       0.75      0.65      0.60       359
weighted avg       0.75      0.64      0.60       359



In [12]:
# create SVM model on bag-of-words feature extraction
svm_bag = SVC(kernel='linear')
svm_bag.fit(bow_matrix_train, y_train)

y_svm_bag_predicted = svm_bag.predict(bow_matrix_test) # predict test data
print(classification_report(y_test, y_svm_bag_predicted)) # print results of SVM model w/ bow

              precision    recall  f1-score   support

           0       0.66      0.87      0.75       177
           1       0.81      0.55      0.66       182

    accuracy                           0.71       359
   macro avg       0.73      0.71      0.70       359
weighted avg       0.74      0.71      0.70       359



In [13]:
# create SVM model on tf*idf feature extraction
svm_tf_idf = SVC(kernel='linear')
svm_tf_idf.fit(tf_idf_training, y_train)

y_svm_tf_idf_predicted = svm_tf_idf.predict(tf_idf_testing) # predict test data
print(classification_report(y_test, y_svm_tf_idf_predicted)) # print results of SVM model w/ tf*idf

              precision    recall  f1-score   support

           0       0.65      0.95      0.78       177
           1       0.92      0.51      0.65       182

    accuracy                           0.73       359
   macro avg       0.79      0.73      0.71       359
weighted avg       0.79      0.73      0.71       359



In [14]:
# create Naive Bayes model on bag-of-words feature extraction
nb_bag = MultinomialNB()
nb_bag.fit(bow_matrix_train, y_train)

y_nb_bag_predicted = nb_bag.predict(bow_matrix_test) # predict test data
print(classification_report(y_test, y_nb_bag_predicted)) # print results of Naive Bayes model w/ bow

              precision    recall  f1-score   support

           0       0.55      0.98      0.71       177
           1       0.93      0.22      0.36       182

    accuracy                           0.60       359
   macro avg       0.74      0.60      0.53       359
weighted avg       0.74      0.60      0.53       359



In [15]:
# create Naive Bayes model on tf*idf feature extraction
nb_tf_idf = MultinomialNB()
nb_tf_idf.fit(tf_idf_training, y_train)

y_nb_tf_idf_predicted = nb_tf_idf.predict(tf_idf_testing) # predict test data
print(classification_report(y_test, y_nb_tf_idf_predicted,zero_division=1)) # print results of Naive Bayes model w/ tf*idf

              precision    recall  f1-score   support

           0       0.49      1.00      0.66       177
           1       1.00      0.00      0.00       182

    accuracy                           0.49       359
   macro avg       0.75      0.50      0.33       359
weighted avg       0.75      0.49      0.33       359



In [16]:
# create Random Forest model on bag-of-words feature extraction
rf_bag = RandomForestClassifier()
rf_bag.fit(bow_matrix_train, y_train)

y_rf_bag_predicted = rf_bag.predict(bow_matrix_test) # predict test data
print(classification_report(y_test, y_rf_bag_predicted)) # print results of Random Forest model w/ bow

              precision    recall  f1-score   support

           0       0.55      0.98      0.71       177
           1       0.91      0.23      0.37       182

    accuracy                           0.60       359
   macro avg       0.73      0.60      0.54       359
weighted avg       0.74      0.60      0.53       359



In [17]:
# create Random Forest model on tf*idf feature extraction
rf_tf_idf = RandomForestClassifier()
rf_tf_idf.fit(tf_idf_training, y_train)

y_rf_tf_idf_predicted = rf_tf_idf.predict(tf_idf_testing) # predict test data
print(classification_report(y_test, y_rf_tf_idf_predicted)) # print results of Random Forest model w/ tf*idf

              precision    recall  f1-score   support

           0       0.55      0.98      0.70       177
           1       0.93      0.21      0.34       182

    accuracy                           0.59       359
   macro avg       0.74      0.60      0.52       359
weighted avg       0.74      0.59      0.52       359

