In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('Data/twitter.csv')

In [5]:
df = df.iloc[:,2:]
df

Unnamed: 0,sentiment,review
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [6]:
df['sentiment'] = df['sentiment'].apply(lambda x: 0 if x == 'Negative' else 1)
df

Unnamed: 0,sentiment,review
0,1,im getting on borderlands and i will murder yo...
1,1,I am coming to the borders and I will kill you...
2,1,im getting on borderlands and i will kill you ...
3,1,im coming on borderlands and i will murder you...
4,1,im getting on borderlands 2 and i will murder ...
...,...,...
74677,1,Just realized that the Windows partition of my...
74678,1,Just realized that my Mac window partition is ...
74679,1,Just realized the windows partition of my Mac ...
74680,1,Just realized between the windows partition of...


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_tweet = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_tweet)

In [10]:
import re
import nltk

def preprocess(text):
    text = str(text)
    text = ' '.join(text.split())
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)

    return text

In [11]:
df['review'] = df['review'].apply(preprocess)
df

Unnamed: 0,sentiment,review
0,1,im getting borderlands murder
1,1,coming borders kill
2,1,im getting borderlands kill
3,1,im coming borderlands murder
4,1,im getting borderlands 2 murder
...,...,...
74677,1,realized windows partition mac like 6 years be...
74678,1,realized mac window partition 6 years behind n...
74679,1,realized windows partition mac 6 years behind ...
74680,1,realized windows partition mac like 6 years be...


In [12]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'] , 
                                   random_state=104,  
                                   test_size=0.1,  
                                   shuffle=True)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

x = vectorizer.fit_transform(df['review'])     

In [14]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(x, df['sentiment'] , 
                                   random_state=104,  
                                   test_size=0.1,  
                                   shuffle=True)

In [15]:
X_test

<7469x15724 sparse matrix of type '<class 'numpy.float64'>'
	with 73268 stored elements in Compressed Sparse Row format>

In [16]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)

In [17]:
report = classification_report(y_test, prediction_linear, output_dict=True)

In [18]:
report

{'0': {'precision': 0.8840579710144928,
  'recall': 0.7707581227436823,
  'f1-score': 0.8235294117647058,
  'support': 2216.0},
 '1': {'precision': 0.908253566913491,
  'recall': 0.9573577003616981,
  'f1-score': 0.9321594068582021,
  'support': 5253.0},
 'accuracy': 0.9019949123041907,
 'macro avg': {'precision': 0.8961557689639919,
  'recall': 0.8640579115526902,
  'f1-score': 0.8778444093114539,
  'support': 7469.0},
 'weighted avg': {'precision': 0.9010749030345004,
  'recall': 0.9019949123041907,
  'f1-score': 0.8999296479711774,
  'support': 7469.0}}