# Text classification using TF-IDF

### 1. Load the dataset from sklearn.datasets

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

### 2. Training data

In [4]:
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

### 3. Test data

In [5]:
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

###  a.  You can access the values for the target variable using .target attribute 
###  b. You can access the name of the class in the target variable with .target_names


In [6]:
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2], dtype=int64)

In [7]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [8]:
twenty_train.data[0:5]

['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n',
 "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the probl

### 4.  Now with dependent and independent data available for both train and test datasets, using TfidfVectorizer fit and transform the training data and test data and get the tfidf features for both

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
import nltk
from nltk.stem.snowball import SnowballStemmer

In [20]:
vect = TfidfVectorizer(ngram_range=(1, 1), min_df=1)
vect.fit(twenty_train.data)
X_train_dtm = vect.transform(twenty_train.data)
X_test_dtm = vect.transform(twenty_test.data)

In [21]:
print (X_train_dtm.shape)
print (X_test_dtm.shape)

(2257, 35788)
(1502, 35788)


### 5. Use logisticRegression with tfidf features as input and targets as output and train the model and report the train and test accuracy score

In [22]:
logreg = LogisticRegression()
logreg.fit(X_train_dtm, twenty_train.target)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
y_pred_class = logreg.predict(X_test_dtm)
print (metrics.accuracy_score(twenty_test.target, y_pred_class))

0.8868175765645806


## Sentiment analysis <br> 

The objective of this problem is to perform Sentiment analysis from the tweets data collected from the users targeted at various mobile devices.
Based on the tweet posted by a user (text), we will classify if the sentiment of the user targeted at a particular mobile device is positive or not.

### 1. Read the dataset (tweets.csv) and drop the NA's while reading the dataset

In [15]:
tweets = pd.read_csv('tweets.csv',encoding = "ISO-8859-1")

In [16]:
tweets.shape

(9093, 3)

In [17]:
tweets = tweets.dropna()

In [18]:
tweets.shape

(3291, 3)

### 2. Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [19]:
def preprocess(text):
    try:
        return text.decode('ascii')
    except Exception as e:
        return ""

In [20]:
tweets['text'] = [preprocess(text) for text in tweets.tweet_text]

In [21]:
tweets.dtypes

tweet_text                                            object
emotion_in_tweet_is_directed_at                       object
is_there_an_emotion_directed_at_a_brand_or_product    object
text                                                  object
dtype: object

### 3. Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [22]:
tweets_pos_neg = tweets[(tweets.is_there_an_emotion_directed_at_a_brand_or_product=='Negative emotion') | (tweets.is_there_an_emotion_directed_at_a_brand_or_product=='Positive emotion')]

In [23]:
y=tweets_pos_neg.is_there_an_emotion_directed_at_a_brand_or_product 
X = tweets_pos_neg.tweet_text

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 4. Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [25]:
vect = CountVectorizer()

In [26]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

In [28]:
X_train_dtm.shape

(2393, 4919)

In [29]:
print(vect.get_feature_names()[-50:])

['yummy', 'zaarly', 'zaarlyiscoming', 'zagg', 'zaggle', 'zappos', 'zazzle', 'zazzlesxsw', 'zazzlsxsw', 'ze', 'zelda', 'zeldman', 'zero', 'zip', 'zite', 'zms', 'zombies', 'zomg', 'zone', 'zoom', 'zzzs', '¼¼', 'á¾_î¾ð', 'äá', 'å_', 'åç', 'åçwhat', 'çü', 'èï', 'ðü', 'öý', 'ù_¾', 'û_', 'ûª', 'ûªll', 'ûªm', 'ûªs', 'ûªt', 'ûï', 'ûï35', 'ûïfoursquare', 'ûïline', 'ûïspecials', 'ûïview', 'ûò', 'ûòand', 'ûó', 'ûójust', 'ûólewis', 'ûóthe']


### 5. Find number of different words in vocabulary

In [75]:
X_train_dtm.shape

(2393, 4919)

In [77]:
X_test_dtm.shape

(798, 4919)

In [24]:
len(vect.vocabulary_)

35788

#### Tip: To see all available functions for an Object use dir

### 6. Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [30]:
tweets_pos_neg['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2672
Negative emotion     519
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### 7. Change the labels for Positive and Negative emotions as 1 and 0 respectively and store in a different column in the same dataframe named 'Label'

Hint: use map on that column and give labels

In [47]:
tweets_pos_neg['label'] = tweets_pos_neg.is_there_an_emotion_directed_at_a_brand_or_product.map({
    'Negative emotion': 0,
    'Positive emotion': 1,
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
tweets_pos_neg.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text,label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,,1


### 8. Define the feature set (independent variable or X) to be `text` column and `labels` as target (or dependent variable)  and divide into train and test datasets

In [50]:
y=tweets_pos_neg.label
X = tweets_pos_neg.tweet_text

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## 9. **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [52]:
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print ('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [53]:
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Accuracy:  0.8558897243107769


In [54]:
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = LogisticRegression()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print ('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [55]:
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Accuracy:  0.8659147869674185


## 10. Create a function called `tokenize_predict` which can take count vectorizer object as input and prints the accuracy for x (text) and y (labels)

In [60]:
def tokenize_test(vect):
    x_train_dtm = vect.fit_transform(X_train)
    print('Features: ', x_train_dtm.shape[1])
    x_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(x_train_dtm, y_train)
    y_pred_class = nb.predict(x_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

### Create a count vectorizer function which includes n_grams = 1,2  and pass it to tokenize_predict function to print the accuracy score

In [61]:

vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Accuracy:  0.8558897243107769


### Create a count vectorizer function with stopwords = 'english'  and pass it to tokenize_predict function to print the accuracy score

In [62]:
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  4681
Accuracy:  0.8533834586466166


### Create a count vectorizer function with stopwords = 'english' and max_features =300  and pass it to tokenize_predict function to print the accuracy score

In [64]:
vect = CountVectorizer(stop_words='english', max_features=300)
tokenize_test(vect)

Features:  300
Accuracy:  0.8107769423558897


In [65]:
print(vect.get_feature_names())

['10', '15', '2011', '2s', '6th', 'action', 'amazing', 'america', 'amp', 'android', 'app', 'apple', 'apps', 'art', 'attendees', 'austin', 'available', 'away', 'awesome', 'based', 'battery', 'begins', 'best', 'better', 'big', 'bing', 'bit', 'blackberry', 'book', 'booth', 'bought', 'brilliant', 'building', 'business', 'buy', 'buzz', 'called', 'case', 'cc', 'charger', 'check', 'choice', 'circles', 'com', 'come', 'comes', 'coming', 'company', 'conferences', 'congrats', 'congress', 'cool', 'core', 'crowd', 'day', 'days', 'demo', 'design', 'details', 'did', 'didn', 'digital', 'diller', 'does', 'doesn', 'doing', 'don', 'download', 'downtown', 'early', 'events', 'excited', 'experience', 'experts', 'facebook', 'fail', 'far', 'fast', 'fb', 'feel', 'flipboard', 'forward', 'foursquare', 'free', 'friends', 'fun', 'future', 'game', 'gave', 'geek', 'genius', 'gets', 'getting', 'giving', 'god', 'going', 'good', 'google', 'got', 'gowalla', 'gram', 'great', 'gsdm', 'gt', 'guide', 'guy', 'guys', 'hand', 

### Create a count vectorizer function with n_grams = 1,2  and max_features = 15000  and pass it to tokenize_predict function to print the accuracy score

In [67]:
vect = CountVectorizer(ngram_range=(1, 2), max_features=15000)
tokenize_test(vect)

Features:  15000
Accuracy:  0.8533834586466166


### Create a count vectorizer function with n_grams = 1,2  and include terms that appear at least 2 times (min_df = 2)  and pass it to tokenize_predict function to print the accuracy score

In [68]:
vect = CountVectorizer(ngram_range=(1, 2), min_df = 2)
tokenize_test(vect)

Features:  7764
Accuracy:  0.8583959899749374
