**03_Data Transformation**

<u>Importing required modules</u>

In [1]:
#Data preprocessing 
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest,f_classif

#Handling warnings
import warnings
warnings.filterwarnings("ignore")

import pickle
import os

<u>Loading data</u>

In [2]:
#Reading the preprocessed train and test splits.
#keep_default_na=False keeps empty strings as-is instead of converting them to NaN.
train, test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in ("processed_train", "processed_test")
)

In [3]:
#Previewing the first five rows of the training data
train.head(n=5)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,respond go,neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,sooo sad miss san diego,negative,noon,21-30,Albania,2877797,27400.0,105
2,boss bulli,negative,night,31-45,Algeria,43851044,2381740.0,18
3,interview leav alon,negative,morning,46-60,Andorra,77265,470.0,164
4,son put releas alreadi bought,negative,noon,60-70,Angola,32866272,1246700.0,26


In [4]:
#Previewing the first five rows of the test data
test.head(n=5)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,last session day http twitpic com ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,shanghai also realli excit precis skyscrap gal...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,recess hit veroniqu branquinho quit compani shame,negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,happi bday,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,http twitpic com w p like,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


For sentiment analysis we only need the tweet and sentiment columns, so we won't be using the other features.

In [5]:
#Keeping only the first two columns of each frame;
#everything from the third column onwards is dropped in place.
for frame in (train, test):
    unused_columns = list(frame.columns[2:])
    frame.drop(columns=unused_columns, inplace=True)

In [6]:
#Confirming which columns remain in the training data
train.head(n=3)

Unnamed: 0,tweet,sentiment
0,respond go,neutral
1,sooo sad miss san diego,negative
2,boss bulli,negative


In [7]:
#Confirming which columns remain in the test data
test.head(n=3)

Unnamed: 0,tweet,sentiment
0,last session day http twitpic com ezh,neutral
1,shanghai also realli excit precis skyscrap gal...,positive
2,recess hit veroniqu branquinho quit compani shame,negative


<u>Vectorization : Converting texts to vectors</u>

In [8]:
#TF-IDF vectorizer over 1- to 3-grams, limited to at most 8000 features
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 3),
)

In [9]:
#Vectorizing train and test tweets
#The vectorizer is fitted on the training tweets only. The test tweets are
#projected with transform(), so both matrices share the same columns and the
#fitted vectorizer is not overwritten by a second fit on the test set.
Vec_train_tweet = tfidf.fit_transform(train.tweet).toarray()
Vec_test_tweet = tfidf.transform(test.tweet).toarray()

In [10]:
#Creating a pickle file for storing vectorizer
#The context manager guarantees the file is flushed and closed after writing
with open("vectorizer.pkl","wb") as vectorizer_file:
    pickle.dump(tfidf,vectorizer_file)

In [11]:
#Checking the shapes (train first, then test)
tuple(matrix.shape for matrix in (Vec_train_tweet, Vec_test_tweet))

((27480, 8000), (3534, 8000))

<u>Feature selection</u>

- Linear Discriminant Analysis

In [12]:
# lda = LinearDiscriminantAnalysis()

In [13]:
# X_train_lda = lda.fit_transform(Vec_train_tweet,train.sentiment)
# X_test_lda = lda.fit_transform(Vec_test_tweet,test.sentiment)

In [14]:
# pickle.dump(lda,open("LDA.pkl","wb"))

In [15]:
# X_train_lda.shape,X_test_lda.shape

LDA did not give good results, so the LDA cells above are left commented out and Select K-best is used instead.

- Select K-best

In [72]:
#Univariate feature selector keeping the 50 best features scored with f_classif
selector = SelectKBest(
    k=50,
    score_func=f_classif,
)

In [73]:
#Feature scores are computed from the training matrix and training labels only.
#The test matrix is reduced with transform(), so it keeps exactly the columns
#chosen on the training data and the test labels are never used for selection.
X_train_kbest = selector.fit_transform(Vec_train_tweet,train.sentiment)
X_test_kbest = selector.transform(Vec_test_tweet)

In [74]:
#Storing the fitted selector; the context manager closes the file after writing
with open("selector.pkl","wb") as selector_file:
    pickle.dump(selector,selector_file)

In [75]:
#Shapes of the reduced matrices (train first, then test)
tuple(reduced.shape for reduced in (X_train_kbest, X_test_kbest))

((27480, 50), (3534, 50))

In [76]:
#Saving the vectorized and reduced data
#Each file is opened in a context manager so it is flushed and closed after writing
with open("X_train.pkl","wb") as train_file:
    pickle.dump(X_train_kbest,train_file)
with open("X_test.pkl","wb") as test_file:
    pickle.dump(X_test_kbest,test_file)

In [77]:
#Checking that the saved training matrix can be read back
#The file is closed as soon as the object has been loaded
with open("X_train.pkl","rb") as saved_file:
    X_train_reloaded = pickle.load(saved_file)
X_train_reloaded.shape

(27480, 50)

*Next -> 04_Modelling and training*