In [153]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Preprocessing

In [120]:
# constants
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.
training_path = '/Users/jackcarpini/Desktop/Northeastern/Final Project/twitter_training.csv'
testing_path = "/Users/jackcarpini/Desktop/Northeastern/Final Project/twitter_validation.csv"
sw = stopwords.words('english')
# Set view of the stop-word list so membership tests in process_text are O(1).
sw_set = frozenset(sw)

# Translation table that deletes every ASCII punctuation character.
trans_table = str.maketrans('', '', string.punctuation)


def process_text(text):
    """Normalize a single tweet for the downstream models.

    Lower-cases the text, strips the characters in ``string.punctuation``,
    tokenizes with NLTK's ``word_tokenize`` and drops English stop words.

    Parameters
    ----------
    text : str
        Raw tweet text. Must be a string (null rows are dropped before this
        function is applied).

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be empty.
    """
    text = text.lower()
    text = text.translate(trans_table)
    tokens = word_tokenize(text)
    # Filter against the precomputed set instead of reloading the stop-word
    # list from the corpus reader for every token.
    filtered_words = [word for word in tokens if word not in sw_set]
    return ' '.join(filtered_words)


# import data
# The file has no header line: with the default header inference the first
# record ('2401', 'Borderlands', 'Positive', <tweet>) becomes the column names
# and is lost as a training example. Read without a header and address the
# columns by position instead.
# NOTE(review): confirm the CSV really ships without a header row.
train_df = pd.read_csv(training_path, header=None)

# drop irrelevant columns (first two positional columns)
train_df = train_df.drop(columns=[0, 1])

# rename columns
train_df = train_df.rename(columns={2: "Sentiment", 3: "Text"})

# remove null values
train_df = train_df.dropna()

# get rid of extra labels
train_df = train_df[~train_df['Sentiment'].isin(['Neutral', 'Irrelevant'])].reset_index(drop=True)

# preprocessing (process_text is defined above, so this cell runs on a fresh kernel)
train_df["Text"] = train_df["Text"].apply(process_text)

# remove duplicate rows
train_df = train_df.drop_duplicates()



# Word2Vec

In [122]:
# Train a Word2Vec model on the preprocessed tweets, one whitespace-split
# token list per tweet.
sentences = list(map(str.split, train_df["Text"]))
model = w2v(
    sentences,
    vector_size=200,
    window=5,
    min_count=1,
    workers=4,
)


In [135]:
# Print the words the trained model ranks as most similar to 'sniper',
# together with their similarity scores.
sniper_neighbours = model.wv.most_similar('sniper')
print(sniper_neighbours)

[('relaxing', 0.9980916380882263), ('slightly', 0.9977652430534363), ('became', 0.9976387619972229), ('gas', 0.9975066184997559), ('dry', 0.9971792101860046), ('surprising', 0.9971683025360107), ('higher', 0.9971504211425781), ('burning', 0.9969445466995239), ('effort', 0.9969199299812317), ('animation', 0.9968719482421875)]


# TF-IDF

In [147]:
# Features are the cleaned tweet text; targets are the sentiment labels.
X, y = train_df['Text'], train_df["Sentiment"]
corpus = X
# Initialize the vectorizer: at most 15000 features, built from
# unigrams and bigrams (1: single words, 2: two words in a row).
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
)
# Fit the vectorizer to the training data
vectorizer_tfidf.fit(corpus)
TfidfVectorizer(max_features=15000, ngram_range=(1, 2))

In [149]:
# Chain the TF-IDF vectorizer with a default logistic regression and fit the
# whole pipeline on the training tweets.
classifier_tfidf = LogisticRegression()
tfidf_steps = [
    ("vectorizer", vectorizer_tfidf),
    ("classifier", classifier_tfidf),
]
model_tfidf = Pipeline(steps=tfidf_steps)
model_tfidf.fit(X, y)

In [155]:
# Score the fitted pipeline on the same data it was trained on; this is an
# in-sample figure, not an estimate of performance on unseen tweets.
predicted_train_tfidf = model_tfidf.predict(X)
accuracy_train_tfidf = accuracy_score(y_true=y, y_pred=predicted_train_tfidf)
report_template = 'Accuracy Training data: {:.1%}'
print(report_template.format(accuracy_train_tfidf))


Accuracy Training data: 93.2%
