Import necessary libraries

In [1]:
import pyforest
import re
import string
import nltk
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

Load the CSV file and assign column names

In [2]:
# Read the tweet dump; column names are supplied explicitly through `names`.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['Sentiment', 'ID', 'Date', 'Query', 'User', 'Tweet'],
    encoding='latin-1',
)

df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Sentiment,ID,Date,Query,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Data cleaning and preprocessing

In [3]:
# Keep only the label and the tweet text; the other columns are dropped.
df = df.drop(['ID', 'Date', 'Query', 'User'], axis=1)
# Relabel positive tweets from 4 to 1.
df['Sentiment'] = df['Sentiment'].replace({4: 1})
df.head()

Unnamed: 0,Sentiment,Tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
#Removes pattern in the input text
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex misreads metacharacters in the match (e.g. "+", "(") and lets
    # a short match strip the prefix of a longer one ("@a" inside "@ab").
    return re.sub(pattern, "", input_txt)

In [5]:
#Remove twitter handles (@User)
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
df['clean_tweet'] = np.vectorize(remove_pattern)(df['Tweet'], r"@[\w]*")

<IPython.core.display.Javascript object>

In [6]:
#Remove special characters, numbers and punctuations
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged.
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)
df.head()

Unnamed: 0,Sentiment,Tweet,clean_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic com y zl Awww that s a bum...
1,0,is upset that he can't update his Facebook by ...,is upset that he can t update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to s...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no it s not behaving at all i m mad why am...


In [7]:
#Remove short words
# Keep only the tokens that are longer than three characters.
df['clean_tweet'] = df['clean_tweet'].apply(
    lambda text: " ".join(token for token in text.split() if len(token) > 3)
)
df.head()

Unnamed: 0,Sentiment,Tweet,clean_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic Awww that bummer shoulda David Ca...
1,0,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because over there


In [8]:
#Tokenizing the words
# Split every cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = df['clean_tweet'].map(str.split)
tokenized_tweet.head()

0    [http, twitpic, Awww, that, bummer, shoulda, D...
1    [upset, that, update, Facebook, texting, might...
2    [dived, many, times, ball, Managed, save, rest...
3              [whole, body, feels, itchy, like, fire]
4               [behaving, here, because, over, there]
Name: clean_tweet, dtype: object

In [9]:
#Word stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Apply the Porter stemmer to each token of each tweet.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
tokenized_tweet.head()

0    [http, twitpic, awww, that, bummer, shoulda, d...
1    [upset, that, updat, facebook, text, might, re...
2    [dive, mani, time, ball, manag, save, rest, bo...
3               [whole, bodi, feel, itchi, like, fire]
4                   [behav, here, becaus, over, there]
Name: clean_tweet, dtype: object

In [10]:
#Combine the words into a single sentence
# Join each token list in a single pass instead of assigning row by row.
# tokenized_tweet is left unmodified, so re-running this cell is safe; an
# in-place loop would, on a second run, join the characters of the strings
# it had already produced.
df['clean_tweet'] = tokenized_tweet.apply(" ".join)
df.head()

Unnamed: 0,Sentiment,Tweet,clean_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic awww that bummer shoulda david ca...
1,0,is upset that he can't update his Facebook by ...,upset that updat facebook text might result sc...
2,0,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest bound
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behav here becaus over there


In [11]:
#Feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned tweets, using the built-in English
# stop-word list and the document-frequency / vocabulary-size limits below.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=100000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

Split the data into training and test sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    bow,
    df['Sentiment'],
    test_size=0.3,
    random_state=42,
)

Model training

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score


In [14]:
# Logistic regression with library-default settings, fitted on the training split.
model = LogisticRegression()
model.fit(X=x_train, y=y_train)

In [15]:
#Predictions and accuracy
# Score the held-out split: fraction of test tweets whose label is predicted correctly.
pred = model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.7551270833333333