# Imports

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

# Preprocessing

In [24]:
# Dataset CSV Paths
training_path = '/Users/chris/Documents/CS 5100/Final Project/twitter_training.csv'
testing_path = '/Users/chris/Documents/CS 5100/Final Project/twitter_validation.csv'

# Import data
train_data = pd.read_csv(training_path)
test_data = pd.read_csv(testing_path)

# drop irrelevant columns
train_data.drop(['2401', 'Borderlands'], axis = 1, inplace = True)
test_data.drop(['3364', 'Facebook'], axis = 1, inplace = True)

# rename columns
train_data = train_data.rename(columns = {"Positive" : "sentiment", "im getting on borderlands and i will murder you all ," : "text"})
test_data = test_data.rename(columns = {"Irrelevant" : "sentiment", "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣" : "text"})
test_data.head()

Unnamed: 0,sentiment,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [25]:
# remove null values
train_data = train_data.dropna()
test_data = test_data.dropna()

# get rid of extra labels
train_data = train_data[~train_data['sentiment'].isin(['Neutral', 'Irrelevant'])].reset_index(drop=True)
test_data = test_data[~test_data['sentiment'].isin(['Neutral', 'Irrelevant'])].reset_index(drop=True)

def tweet_to_words(tweet):
    letters_only = re.sub("[^a-zA-Z]", " ",tweet) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops] 
    return( " ".join( meaningful_words ))

nltk.download('stopwords')
train_data['clean_tweet']=train_data['text'].apply(lambda x: tweet_to_words(x))
test_data['clean_tweet']=test_data['text'].apply(lambda x: tweet_to_words(x))
train_data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,text,clean_tweet
0,Positive,I am coming to the borders and I will kill you...,coming borders kill
1,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill
2,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
3,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands murder
4,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder


# Vectorize

In [26]:
x_train = train_data.clean_tweet
y_train = train_data.sentiment

x_test = test_data.clean_tweet
y_test = test_data.sentiment
print(len(x_train), len(y_train))

43012 43012


In [27]:
# Split data
# x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)

CountVectorizer()

In [29]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)

In [30]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
vect_tunned

CountVectorizer(max_df=0.7, max_features=100, min_df=0.1, ngram_range=(1, 2),
                stop_words='english')

In [31]:
#training SVM model with linear kernel
#Support Vector Classification-wrapper around SVM
from sklearn.svm import SVC
model = SVC(kernel='linear', random_state = 10)
model.fit(x_train_dtm, y_train)
#predicting output for test data
pred = model.predict(x_test_dtm)

In [32]:
#accuracy score
accuracy_score(y_test,pred)

0.9723756906077348