# Transfer Learning MNIST

* Train a simple convnet on the MNIST dataset the first 5 digits [0..4].
* Freeze convolutional layers and fine-tune dense layers for the classification of digits [5..9].

## 1. Import necessary libraries for the model

In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.utils import np_utils

import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
% matplotlib inline
from keras.models import Model

## 2. Import MNIST data and create 2 datasets with one dataset having digits from 0 to 4 and other from 5 to 9 

In [0]:
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [0]:
#Creating a dataset for digits 0-4
x_train_1 = x_train[y_train < 5]
y_train_1 = y_train[y_train < 5]
x_test_1 = x_test[y_test < 5]
y_test_1 = y_test[y_test < 5]

In [0]:
#Creating a dataset for digits 5-9
x_train_2 = x_train[y_train >= 5]
y_train_2 = y_train[y_train >= 5]-5
x_test_2 = x_test[y_test >= 5]
y_test_2 = y_test[y_test >= 5]-5

## 3. Print x_train, y_train, x_test and y_test for both the datasets

In [109]:
print('x_train for digits 0-4 \n',x_train_1)
print('y_train for digits 0-4 \n',y_train_1)
print('x_test for digits 0-4 \n',x_test_1)
print('y_test for digits 0-4 \n',y_test_1)

x_train for digits 0-4 
 [[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
y_train for digits 0-4 
 [0 4 1 ... 2 1 3]
x_test for digits 0-4 
 [[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0

In [110]:
print('x_train for digits 5-9 \n',x_train_2)
print('y_train for digits 5-9 \n',y_train_2)
print('x_test for digits 5-9 \n',x_test_2)
print('y_test for digits 5-9 \n',y_test_2)

x_train for digits 5-9 
 [[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
y_train for digits 5-9 
 [0 4 0 ... 0 1 3]
x_test for digits 5-9 
 [[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0

## ** 4. Let us take only the dataset (x_train, y_train, x_test, y_test) for Integers 0 to 4 in MNIST **
## Reshape x_train and x_test to a 4 Dimensional array (channel = 1) to pass it into a Conv2D layer

In [0]:
#reshaping
x_train_1 = x_train_1.reshape(x_train_1.shape[0], 28, 28, 1)
x_test_1 = x_test_1.reshape(x_test_1.shape[0], 28, 28, 1)

In [0]:
#reshaping
x_train_2 = x_train_2.reshape(x_train_2.shape[0], 28, 28, 1)
x_test_2 = x_test_2.reshape(x_test_2.shape[0], 28, 28, 1)

## 5. Normalize x_train and x_test by dividing it by 255

In [0]:
#Normalizing Data
x_train_1 = x_train_1.astype('float32')
x_test_1 = x_test_1.astype('float32')
x_train_1 = x_train_1 / 255.0
x_test_1 = x_test_1 / 255.0


In [0]:
#Normalizing Data
x_train_2 = x_train_2.astype('float32')
x_test_2 = x_test_2.astype('float32')
x_train_2 = x_train_2 / 255.0
x_test_2 = x_test_2 / 255.0

## 6. Use One-hot encoding to divide y_train and y_test into required no of output classes

In [0]:

y_train_2 = np_utils.to_categorical(y_train_2)
y_test_2 = np_utils.to_categorical(y_test_2)

In [0]:

y_train_1 = np_utils.to_categorical(y_train_1)
y_test_1 = np_utils.to_categorical(y_test_1)

In [117]:
 np.unique(y_train_1)
y_train_1.shape

array([0., 1.], dtype=float32)

(30596, 5)

In [118]:
np.unique(y_train_2)
y_train_2.shape

array([0., 1.], dtype=float32)

(29404, 5)

## 7. Build a sequential model with 2 Convolutional layers with 32 kernels of size (3,3) followed by a Max pooling layer of size (2,2) followed by a drop out layer to be trained for classification of digits 0-4  

In [0]:
#Initialize the model
model = Sequential()

#Add a Convolutional Layer with 32 filters of size 3X3 and activation function as 'ReLU' 
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(28,28,1)))

#Add a MaxPooling Layer of size 2X2 
model.add(MaxPooling2D(pool_size=(2, 2)))

#Apply Dropout with 0.25 probability 
model.add(Dropout(0.25))

## 8. Post that flatten the data and add 2 Dense layers with 128 neurons and neurons = output classes with activation = 'relu' and 'softmax' respectively. Add dropout layer inbetween if necessary  

In [0]:
#Flatten the layer
model.add(Flatten())

#Add Fully Connected Layer with 128 units and activation function as 'ReLU'
model.add(Dense(128, activation='relu'))

#Apply Dropout with 0.5 probability 
model.add(Dropout(0.5))

#Add Fully Connected Layer with 10 units and activation function as 'softmax'
model.add(Dense(5, activation='softmax'))

In [121]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 13, 13, 32)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 5408)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               692352    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 645       
Total para

## 9. Print the training and test accuracy

In [0]:
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [124]:
model.fit(x_train_1, y_train_1, batch_size=40, epochs=10,validation_data=(x_test_1, y_test_1))

Train on 30596 samples, validate on 5139 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f57500aa208>

In [125]:
train_metrics = model.evaluate(x_train_1, y_train_1)
print('Train loss:', train_metrics[0])
print('Train accuracy:', train_metrics[1])

Train loss: 0.0013721625391915437
Train accuracy: 0.9996731598901817


In [126]:
test_metrics = model.evaluate(x_test_1, y_test_1)
print('Test loss:', test_metrics[0])
print('Test accuracy:', test_metrics[1])

Test loss: 0.0061140594181627456
Test accuracy: 0.9980540961276513


## 10. Make only the dense layers to be trainable and convolutional layers to be non-trainable

In [127]:
model.layers

[<keras.layers.convolutional.Conv2D at 0x7f57504fc6a0>,
 <keras.layers.pooling.MaxPooling2D at 0x7f57504e4588>,
 <keras.layers.core.Dropout at 0x7f57504fc278>,
 <keras.layers.core.Flatten at 0x7f57504fc668>,
 <keras.layers.core.Dense at 0x7f57504fc860>,
 <keras.layers.core.Dropout at 0x7f5750222a58>,
 <keras.layers.core.Dense at 0x7f57501da320>]

In [0]:
model_trans = Model(inputs=model.input, outputs=model.get_layer('flatten_3').output )
#model = Model(inputs=base_model.input, outputs=base_model.get_layer('block4_pool').output )
model_trans.trainable = False

In [130]:
model_trans.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3_input (InputLayer)  (None, 28, 28, 1)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 13, 13, 32)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 5408)              0         
Total params: 320
Trainable params: 0
Non-trainable params: 320
_________________________________________________________________


## 11. Use the model trained on 0 to 4 digit classification and train it on the dataset which has digits 5 to 9  (Using Transfer learning keeping only the dense layers to be trainable)

In [0]:
x_train_2_trans = model_trans.predict(x_train_2)

In [0]:
x_test_2_trans = model_trans.predict(x_test_2)

In [133]:
x_train_1.shape

(30596, 28, 28, 1)

In [134]:
x_train_1_trans.shape

(30596, 5408)

In [136]:
y_train_1.shape

(30596, 5)

In [0]:
model_transfer = Sequential()
#model_transfer.add(Dense(128, input_dim=512, activation='relu'))
#model_transfer.add(Dropout(0.25))
#model_transfer.add(Dense(64, activation='relu'))
#model_transfer.add(Dense(10, activation='softmax'))
model_transfer.add(Dense(5, input_dim=5408, activation='softmax'))

In [138]:
model_transfer.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 5)                 27045     
Total params: 27,045
Trainable params: 27,045
Non-trainable params: 0
_________________________________________________________________


In [139]:
model_transfer.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model_transfer.fit(x_train_2_trans, y_train_2, batch_size=40, epochs=10,validation_data=(x_test_2_trans, y_test_2))#

Train on 29404 samples, validate on 4861 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5735c24f60>

## 12. Print the accuracy for classification of digits 5 to 9

In [142]:
test_metrics = model_transfer.evaluate(x_test_2_trans, y_test_2)
print('Test loss:', test_metrics[0])
print('Test accuracy:', test_metrics[1])

Test loss: 0.03228350418296598
Test accuracy: 0.9903312075704588


(30596, 5)

## Sentiment analysis <br> 

The objective of the second problem is to perform Sentiment analysis from the tweets data collected from the users targeted at various mobile devices.
Based on the tweet posted by a user (text), we will classify if the sentiment of the user targeted at a particular mobile device is positive or not.

### 13. Read the dataset (tweets.csv) and drop the NA's while reading the dataset

In [0]:
import pandas as pd

In [145]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
tweet = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/LAB and Assignment 8th/Lab Internal/tweets.csv',encoding = 'ISO-8859-1').dropna(axis = 0)

In [195]:
tweet.shape

(3291, 3)

In [0]:
data = tweet

In [198]:
tweet.columns

Index(['tweet_text', 'emotion_in_tweet_is_directed_at',
       'is_there_an_emotion_directed_at_a_brand_or_product'],
      dtype='object')

### 14. Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [0]:
def preprocess(text):
    try:
        return text.encode().decode('ascii')
    except Exception as e:
        return ""

In [0]:
data['text'] = [preprocess(text) for text in data.tweet_text]

### 15. Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [201]:
data.head()


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@jessedee Know about @fludapp ? Awesome iPad/i...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@swonderlin Can not wait for #iPad 2 also. The...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@sxsw I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@sxtxstate great stuff on Fri #SXSW: Marissa M...


In [202]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()


Positive emotion                      2672
Negative emotion                       519
No emotion toward brand or product      91
I can't tell                             9
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [0]:
data = data[data['is_there_an_emotion_directed_at_a_brand_or_product'].isin(['Negative emotion','Positive emotion'])]

In [204]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2672
Negative emotion     519
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### 16. Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [0]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [0]:
vect = CountVectorizer()

In [209]:
vect.fit(data["text"])
X_features = vect.transform(data["text"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### 17. Find number of different words in vocabulary

In [218]:
X_features.shape # 5482 words in vaocabulary
vect.vocabulary_

(3191, 5482)

{'wesley83': 5291,
 'have': 2224,
 '3g': 77,
 'iphone': 2573,
 'after': 216,
 'hrs': 2363,
 'tweeting': 5025,
 'at': 402,
 'rise_austin': 4042,
 'it': 2595,
 'was': 5248,
 'dead': 1252,
 'need': 3218,
 'to': 4896,
 'upgrade': 5108,
 'plugin': 3610,
 'stations': 4507,
 'sxsw': 4659,
 'jessedee': 2624,
 'know': 2714,
 'about': 143,
 'fludapp': 1857,
 'awesome': 459,
 'ipad': 2563,
 'app': 334,
 'that': 4801,
 'you': 5452,
 'll': 2851,
 'likely': 2821,
 'appreciate': 354,
 'for': 1885,
 'its': 2597,
 'design': 1312,
 'also': 269,
 'they': 4824,
 're': 3861,
 'giving': 2045,
 'free': 1916,
 'ts': 4994,
 'swonderlin': 4650,
 'can': 781,
 'not': 3278,
 'wait': 5220,
 'should': 4273,
 'sale': 4095,
 'them': 4810,
 'down': 1458,
 'hope': 2336,
 'this': 4838,
 'year': 5435,
 'festival': 1791,
 'isn': 2590,
 'as': 393,
 'crashy': 1147,
 'sxtxstate': 4683,
 'great': 2121,
 'stuff': 4572,
 'on': 3350,
 'fri': 1921,
 'marissa': 2985,
 'mayer': 3016,
 'google': 2084,
 'tim': 4871,
 'reilly': 3931,
 

#### Tip: To see all available functions for an Object use dir

In [214]:
dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_sort_features',
 '_stop_words_id',
 '_validate_custom_analyzer',
 '_validate_params',
 '_validate_vocabulary',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words',
 'input',
 'inverse_transf

### 18. Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [219]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2672
Negative emotion     519
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### 19. Change the labels for Positive and Negative emotions as 1 and 0 respectively and store in a different column in the same dataframe named 'Label'

Hint: use map on that column and give labels

In [229]:
data["is_there_an_emotion_directed_at_a_brand_or_product"] = data["is_there_an_emotion_directed_at_a_brand_or_product"].astype("category")

data["Label"] = data["is_there_an_emotion_directed_at_a_brand_or_product"].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [231]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text,Label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@jessedee Know about @fludapp ? Awesome iPad/i...,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@swonderlin Can not wait for #iPad 2 also. The...,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@sxsw I hope this year's festival isn't as cra...,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1


### 20. Define the feature set (independent variable or X) to be `text` column and `labels` as target (or dependent variable)  and divide into train and test datasets

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, data["Label"], random_state=2)

## 21. **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [0]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [235]:
# train the model using X_train_dtm
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# make class predictions 
y_pred_class = nb.predict(X_test)

In [239]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.8558897243107769

In [0]:
#Logistic
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [244]:
lg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred_class_lg = lg.predict(X_test)

In [246]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class_lg)

0.868421052631579

## 22. Create a function called `tokenize_predict` which can take count vectorizer object as input and prints the accuracy for x (text) and y (labels)

In [0]:
def tokenize_predict(vect):
    x_train_dtm = vect.fit_transform(X_train_1)
    print('Features: ', x_train_dtm.shape[1])
    x_test_dtm = vect.transform(X_test_1)
    nb = MultinomialNB()
    nb.fit(x_train_dtm, y_train_1)
    y_pred_class = nb.predict(x_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test_1, y_pred_class))

### Create a count vectorizer function which includes n_grams = 1,2  and pass it to tokenize_predict function to print the accuracy score

In [257]:
vect = CountVectorizer(ngram_range=(1,2))

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data["text"], data["Label"], random_state=2)
tokenize_predict(vect)

Features:  23881
Accuracy:  0.8709273182957393


### Create a count vectorizer function with stopwords = 'english'  and pass it to tokenize_predict function to print the accuracy score

In [260]:
vect = CountVectorizer(stop_words='english')

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data["text"], data["Label"], random_state=2)
tokenize_predict(vect)

Features:  4531
Accuracy:  0.8634085213032582


### Create a count vectorizer function with stopwords = 'english' and max_features =300  and pass it to tokenize_predict function to print the accuracy score

In [261]:
vect = CountVectorizer(stop_words='english', max_features=300)

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data["text"], data["Label"], random_state=2)
tokenize_predict(vect)

Features:  300
Accuracy:  0.8245614035087719


### Create a count vectorizer function with n_grams = 1,2  and max_features = 15000  and pass it to tokenize_predict function to print the accuracy score

In [263]:
vect = CountVectorizer(ngram_range=(1,2),stop_words='english', max_features=15000)

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data["text"], data["Label"], random_state=2)
tokenize_predict(vect)

Features:  15000
Accuracy:  0.87468671679198


### Create a count vectorizer function with n_grams = 1,2  and include terms that appear at least 2 times (min_df = 2)  and pass it to tokenize_predict function to print the accuracy score

In [265]:
vect = CountVectorizer(ngram_range=(1,2),min_df = 2)

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data["text"], data["Label"], random_state=2)
tokenize_predict(vect)

Features:  7582
Accuracy:  0.8771929824561403
