Instructions to run the program:
1. Open this Jupyter notebook in Google Colab.
2. Run the cells from top to bottom. You might need to edit the path used for loading the dataset.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

%matplotlib inline


In [2]:
# connect to google drive
# Needs the google.colab package, so this cell is meant to run inside Colab.
# Drive is mounted at /content/gdrive; the data-loading cell reads the CSVs
# from a folder underneath that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# load data
%cd /content/gdrive/My Drive/Colab Notebooks/CS4248_group_project/raw_data
fulltrain = pd.read_csv("fulltrain.csv", names=['label', 'text'])
balancedtest = pd.read_csv("balancedtest.csv", names=['label', 'text'])

# Print number of rows per dataset
print(f"fulltrain:  Loaded {len(fulltrain.index)} rows")
print(f"balancedtest: Loaded {len(balancedtest.index)} rows")

/content/gdrive/My Drive/Colab Notebooks/CS4248_group_project/raw_data
fulltrain:  Loaded 48854 rows
balancedtest: Loaded 3000 rows


In [5]:
# process training data
# Labels come straight from the 'label' column of the loaded training frame.
trainY = fulltrain['label']

# feature engineering, tf-idf
# fit_transform() learns the vocabulary / IDF weights and vectorizes the
# training text in one pass. It yields the same matrix as fit() followed by
# transform(), without tokenizing the whole corpus twice. The fitted
# `vectorizer` is reused later to transform the test text.
vectorizer = TfidfVectorizer()
trainX = vectorizer.fit_transform(fulltrain['text'])

# Sanity check: one feature row per training label.
print(trainX.shape)
print(trainY.shape)

(48854, 229597)
(48854,)


In [6]:
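# NOTE: this grid search is kept commented out for reference only. It refits the
# model for every solver/C combination under repeated 10-fold cross-validation.
# The next cell hard-codes hyper-parameters that it says were chosen with GridSearchCV.
# # model, find best hyper-parameter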
# model = LogisticRegression()
# # use cross validation to find a best hyper parameters
# solvers = ['newton-cg', 'lbfgs', 'liblinear']
# penalty = ['l2']
# c_values = [100, 10, 1.0, 0.1, 0.01]
# # define grid search
# grid = dict(solver=solvers, penalty=penalty, C=c_values)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# grid_result = grid_search.fit(trainX, trainY)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#   print("%f (%f) with: %r" % (mean, stdev, param))
# model = grid_result.best_estimator_


In [7]:
# fit model on the training data (tf-idf matrix trainX, labels trainY)
# hyper-parameters were chosen via hyperparam tuning with GridSearchCV from sklearn
# max_iter is raised to 5000 because the default was found to underfit
lr_params = {'C': 10, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 5000}
model = LogisticRegression(**lr_params)
# bare expression last so the notebook still displays the fitted estimator
model.fit(trainX, trainY)


LogisticRegression(C=10, max_iter=5000, solver='liblinear')

In [8]:
# process test data
# Labels are used as-is; the text is vectorized with the already-fitted
# tf-idf vectorizer (transform only, no refit on the test split).
testY = balancedtest['label']
testX = vectorizer.transform(balancedtest['text'])

In [9]:
# test model
# Predict on the vectorized test split, then score against the true labels:
# macro-averaged precision/recall/f-score plus plain accuracy.
result = model.predict(testX)
metrics = precision_recall_fscore_support(testY, result, average='macro')
acc = accuracy_score(testY, result)

# Each heading is followed by its value on the next line.
print("precision, recall, fscore:", metrics, sep="\n")
print("accuracy:", acc, sep="\n")


precision, recall, fscore:
(0.7665878365648181, 0.7366666666666666, 0.7247357875908803, None)
accuracy:
0.7366666666666667
