# Sentiment Analysis (Kaggle Challenge)
#### Redicting sentiment from tweets

## 1. Setup Project

In [1]:
import pandas as pd
import re
import nltk
import numpy as np

from sklearn.model_selection import train_test_split # spliting train and test data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # for transforming text to matrix
from sklearn.naive_bayes import MultinomialNB, GaussianNB # Naive Bayes models
from sklearn import metrics # for scoring
from nltk.corpus import stopwords # for filtering out stop words in text
from nltk.stem import PorterStemmer # for stemming words

### Importing data

In [2]:
traindata = pd.read_csv('train.csv')

In [3]:
traindata.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


## 2. Preparing Data

### Removing unneccesary data from Training data

##### Removing empty data in text column 

In [4]:
traindata.dropna(subset=['text'], inplace=True)

##### Removing punctuations

In [5]:
traindata.text = traindata.text.apply(lambda x: re.sub(r'[^\w\s]', '', x) ) 

In [6]:
traindata.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,Id have responded if I were going,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego,negative
2,088c60f138,my boss is bullying me,negative
3,9642c003ef,what interview leave me alone,negative
4,358bd9e861,Sons of why couldnt they put them on the rel...,negative


##### Removing Stopwords

In [7]:
stopwords = stopwords.words('english')
traindata.text = traindata.text.apply(
    lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords])
)

In [8]:
traindata.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,Id responded going,neutral
1,549e992a42,Sooo SAD miss San Diego,negative
2,088c60f138,boss bullying,negative
3,9642c003ef,interview leave alone,negative
4,358bd9e861,Sons couldnt put releases already bought,negative


##### Removing links

In [9]:
traindata.text = traindata.text.apply(lambda x: re.sub(r'\(?http\S+', '', x))

### Grouping similar words with Lemmatization

In [10]:
lemmatizer = nltk.WordNetLemmatizer()

 ##### Selecting lemmatization and applying to dataframe


In [11]:
traindata.text = traindata.text.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]))

In [12]:
traindata.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,Id responded going,neutral
1,549e992a42,Sooo SAD miss San Diego,negative
2,088c60f138,bos bullying,negative
3,9642c003ef,interview leave alone,negative
4,358bd9e861,Sons couldnt put release already bought,negative


### Using Training Data to train a naive bayes sentiment classifier

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

##### Using CountVectorizer() to transform text into a matrix for machine learning

In [14]:
vectorizer = CountVectorizer()

##### Transforming text column into matrix

In [15]:
traindata.dropna(subset=['text'], inplace=True)
X = vectorizer.fit_transform(traindata.text)

##### Representing a multinomial naive bayes model, and fitting it

In [16]:
nb = MultinomialNB()

#nb.fit(features_matrix, item_we_want_to_predict)
nb.fit(X, traindata.sentiment)

MultinomialNB()

### Setting up Test Data

##### Test data will be preprocessed the same way as Training data

In [17]:
testdata = pd.read_csv('test.csv')
testdata.head(5)

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!


In [18]:
testdata.dropna(subset=['text'], inplace=True)

In [19]:
testdata.text = testdata.text.apply(lambda x: re.sub(r'[^\w\s]', '', x) ) 
testdata.head(5)

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day httptwitpiccom67ezh
1,96d74cb729,Shanghai is also really exciting precisely s...
2,eee518ae67,Recession hit Veronique Branquinho she has to ...
3,01082688c6,happy bday
4,33987a8ee5,httptwitpiccom4w75p I like it


In [20]:
testdata.text = testdata.text.apply(
    lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords])
)

In [21]:
testdata.text = testdata.text.apply(lambda x: re.sub(r'\(?http\S+', '', x))
testdata.head(5)

Unnamed: 0,textID,text
0,f87dea47db,Last session day
1,96d74cb729,Shanghai also really exciting precisely skyscr...
2,eee518ae67,Recession hit Veronique Branquinho quit compan...
3,01082688c6,happy bday
4,33987a8ee5,like


In [22]:
testdata.text = testdata.text.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]))
testdata.head(5)

Unnamed: 0,textID,text
0,f87dea47db,Last session day
1,96d74cb729,Shanghai also really exciting precisely skyscr...
2,eee518ae67,Recession hit Veronique Branquinho quit compan...
3,01082688c6,happy bday
4,33987a8ee5,like


### Fitting test data into matrix

In [23]:
X = vectorizer.fit_transform(testdata.text)

In [24]:
nb.fit(X, testdata.text)

MultinomialNB()

# Building Model and Test

#### Spliting arrays into random train and test subsets

In [25]:
train_X, test_X, train_Y, test_Y = train_test_split(X,testdata.text ,test_size=0.3, random_state=42)
print('Training cases: %d\nTest cases: %d' % (train_X.size, test_X.size))

Training cases: 17371
Test cases: 7261


Applying MultinomialNB

In [26]:
# nb = MultinomialNB()

In [27]:
nb.fit(train_X,train_Y)

MultinomialNB()

In [28]:
y_pred = nb.predict(test_X)

In [29]:
nb.score(test_X, test_Y)

0.003770028275212064

Cause of low score
- ?