#### Import Libraries

In [1]:
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import nltk
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

#### Load Dataset:

In [6]:
# Read the tab-separated sentiment file into a DataFrame.
dataset_path = 'data/sentiment.tsv'
data = pd.read_csv(dataset_path, sep='\t')

# Give the two columns descriptive names: the label first, then the tweet text.
data.columns = ['sentiment', 'tweets']

# Preview the first rows as a sanity check.
data.head()

Unnamed: 0,sentiment,tweets
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,neg,Had a dream about a walk in fast food resturau...
2,neg,hates @internet @explrer (angry)(angry) **but ...
3,neg,@federalcase I said I go out for eat 5:negneg...
4,neg,@babykates7 yeah they won't do the surgery til...


### Pre-Processing:

In [8]:
from sklearn.preprocessing import LabelEncoder

# Convert the string sentiment labels into integer codes.
# The encoder is kept in `le` so the learned label <-> code mapping stays available.
le = LabelEncoder()
le.fit(data['sentiment'])
data['sentiment'] = le.transform(data['sentiment'])

# Inspect the last rows to confirm the labels are now numeric.
data.tail()

Unnamed: 0,sentiment,tweets
1996,1,Just smashed this Tommy's for my 2PM breakfast
1997,1,@ionacosmetics spent yesterday outside myself-...
1998,1,@Covergirlneg8 I take pride in what I do
1999,1,heading to work on the 6
2000,1,@queith asi es!


##### Cleaning Data:

In [9]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        The text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with each non-overlapping match of ``pattern``
        replaced by the empty string.
    """
    # Substitute with the pattern itself in a single pass. Feeding each matched
    # substring back into re.sub as a new regex is unsafe: metacharacters in
    # the match (e.g. "(", "+", ".") get reinterpreted, and a short match that
    # is a prefix of a longer one ("@ab" vs "@abc") leaves stray characters.
    return re.sub(pattern, '', input_txt)

##### Removing Twitter handles (e.g. @user)

In [10]:
# Strip Twitter handles ("@" followed by any run of word characters) from each
# tweet and store the result in a new column, leaving the raw text untouched.
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
data['clean_tweet'] = np.vectorize(remove_pattern)(data['tweets'], r"@[\w]*")

# Preview the raw and cleaned text side by side.
data.head()

Unnamed: 0,sentiment,tweets,clean_tweet
0,0,"@jamielewislewis i cant believe it, it really ...","i cant believe it, it really doesnt belong th..."
1,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...,hates (angry)(angry) **but no choice** http...
3,0,@federalcase I said I go out for eat 5:negneg...,I said I go out for eat 5:negneg p.m. I dis...
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won't do the surgery till the cold ...


##### Removing special characters, numbers, and punctuation

In [11]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: the default of Series.str.replace changed
# to regex=False in pandas 2.0, where the pattern would otherwise be treated as
# a literal string and nothing would be cleaned.
data['clean_tweet'] = data['clean_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Preview the cleaned text.
data.head()

  data['clean_tweet'] = data['clean_tweet'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,sentiment,tweets,clean_tweet
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believe it it really doesnt belong th...
1,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...,hates angry angry but no choice http...
3,0,@federalcase I said I go out for eat 5:negneg...,I said I go out for eat negneg p m I dis...
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won t do the surgery till the cold ...
