# Building word frequencies

##  Import functions and data

In [168]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
# Summary
from sklearn import datasets

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from scipy.stats import randint

# tensor-Keras
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools



import warnings
import scipy.io
print("TensorFlow version: ", tf.__version__)

warnings.filterwarnings('ignore')

TensorFlow version:  2.2.0


# Load Data

In [169]:
# import data 
tweets_train= pd.read_table('C:/Users/rzouga/Downloads/Github/NLP/train_tweets.txt', '\t',header=None)
tweets_train = tweets_train[0].str.split(",", n = 1, expand = True) 
tweets_train.columns = ["sentiment", "tweets"]
tweets_test =pd.read_table('C:/Users/rzouga/Downloads/Github/NLP/test_tweets.txt', '\t',header=None)
tweets_test = tweets_test[0].str.split(",", n = 1, expand = True) 
tweets_test.columns = ["sentiment", "tweets"]
# Print the head of df
tweets_train.head(3)

Unnamed: 0,sentiment,tweets
0,positive,Gas by my house hit $3.39!!!! I'm going to Ch...
1,negative,"Theo Walcott is still shit, watch Rafa and Jo..."
2,negative,"its not that I'm a GSP fan, i just hate Nick ..."


# Tweet Processing

In [170]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

# Process tweet

The given function process_tweet() tokenizes the tweet into individual words, removes stop words and applies stemming.


In [171]:
# test the function below
print('This is an example of a positive tweet: \n', tweets_train['tweets'][0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(tweets_train['tweets'][0]))

This is an example of a positive tweet: 
  Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)

This is an example of the processed version of the tweet: 
 ['ga', 'hous', 'hit', '39', "i'm", 'go', 'chapel', 'hill', 'sat', ':)']


# Target Encoder

In [187]:
tweets_test['tweets'].tail()

1039     #WEB YouTube improves upload process with opt...
1040     Gonna change my Tumblr theme. I hope I can fi...
1041     I'm so jealous of everyone at the Justin Bieb...
1042     Jim Harbaugh, Alex Smith Drive Giants World S...
1043     #Trending: Tim Tebow is now dating cave woman...
Name: tweets, dtype: object

In [182]:
from sklearn.preprocessing import OneHotEncoder
X_train= tweets_train['tweets']# the features we want to analyze
X_test=tweets_test['tweets']
y_train=tweets_train['sentiment']
y_test =tweets_test['sentiment']

#### Create the encoder.
encoder = LabelEncoder()

encoder.fit(y_train)   # Assume for simplicity all features are categorical.

y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

print("y_train",y_train.shape)
print("y_test",y_test.shape)


y_train (6588,)
y_test (1044,)


# Feature Extraction

## Building word frequencies

 In our goal of tweet sentiment analysis, this function will build a dictionary where we can lookup how many times a word appears in the lists of positive or negative tweets. This will be very helpful when extracting the features of the dataset .

## Word frequency dictionary

Now that we know the building blocks, let's finally take a look at the build_freqs() function . This is the function that creates the dictionary containing the word counts from each corpus.


In [173]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 , 1,2)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    # yslist = np.squeeze(ys).tolist()
    yslist = np.squeeze(ys)

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    #for y, tweet in zip(yslist, tweets):
        #for word in process_tweet(tweet):
            #pair = (word, y)
            #if pair in freqs:
                #freqs[pair] += 1
            #else:
                #freqs[pair] = 1
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [174]:
# create frequency dictionary
freqs = build_freqs(X_train, y_train)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 19229


In [175]:
print(freqs)

{('ga', 2): 1, ('hous', 2): 33, ('hit', 2): 16, ('39', 2): 1, ("i'm", 2): 149, ('go', 2): 267, ('chapel', 2): 4, ('hill', 2): 11, ('sat', 2): 38, (':)', 2): 165, ('theo', 0): 1, ('walcott', 0): 1, ('still', 0): 34, ('shit', 0): 33, ('watch', 0): 55, ('rafa', 0): 2, ('johnni', 0): 5, ('deal', 0): 10, ('saturday', 0): 36, ("i'm", 0): 84, ('gsp', 0): 2, ('fan', 0): 7, ('hate', 0): 17, ('nick', 0): 6, ('diaz', 0): 2, ("can't", 0): 37, ('wait', 0): 28, ('februari', 0): 8, ('iranian', 0): 1, ('gener', 0): 1, ('say', 0): 37, ("israel'", 0): 2, ('iron', 0): 3, ('dome', 0): 1, ('missil', 0): 1, ('keep', 0): 9, ('talk', 0): 17, ('like', 0): 74, ('may', 0): 120, ('end', 0): 15, ('find', 0): 7, ('j', 2): 7, ('davlar', 2): 1, ('11th', 2): 6, ('main', 2): 9, ('rival', 2): 2, ('team', 2): 44, ('poland', 2): 4, ('hope', 2): 115, ('make', 2): 98, ('success', 2): 9, ('end', 2): 24, ('tough', 2): 3, ('week', 2): 45, ('train', 2): 8, ('tomorrow', 2): 479, ("act'", 0): 1, ("sat'", 0): 3, ('decid', 0): 6, (

In [176]:
print(freqs.get(('hate',0)))
print(freqs.get(('hate',1)))
print(freqs.get(('hate',2)))
print(encoder.classes_)

17
5
4
['negative' 'neutral' 'positive']


###  Extracting the features + Modeling 

Given a list of tweets, extract the features and store them in a matrix. You will extract 3 features.

        The first feature is the number of negative  words in a tweet.
        The second feature is the number of neutral words in a tweet.
        The third feature is the number of positive words in a tweet.
        
Then train your  classifier on these features.

Test the classifier on a validation set.


In [177]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,4)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 4 elements in the form of a 1 x 4 vector
    x = np.zeros((1, 4)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the negative label 0
        x[0,1] += freqs.get((word, 0.0),0)
        # increment the word count for the neutral label 1
        x[0,2] += freqs.get((word, 1.0),0)
        
        # increment the word count for the positive label 2
        x[0,3] += freqs.get((word, 2.0),0)
        
    
    assert(x.shape == (1, 4))
    return x

In [178]:
# Check your function

# test 1
# test on training data
tmp1 = extract_features(X_train[0], freqs)
print(encoder.classes_)
print(tmp1)
print(X_train[0])

['negative' 'neutral' 'positive']
[[  1. 270. 634. 685.]]
 Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)


# Prepare  X_train_Freq

In [179]:
# collect the features 'x' and stack them into a matrix 'X'
X_train_Freq = np.zeros((len(X_train), 4))
for i in range(len(X_train)):
    X_train_Freq[i, :]= extract_features(X_train[i], freqs)

# training labels corresponding to X
print(X_train_Freq.shape)
print(y_train.shape)

(6588, 4)
(6588,)


# Prepare X_test_Freq

In [189]:
# collect the features 'x' and stack them into a matrix 'X'
X_test_Freq = np.zeros((len(X_test), 4))
for i in range(len(X_test)):
    X_test_Freq[i, :]= extract_features(X_test[i], freqs)

# training labels corresponding to X
print( X_test_Freq.shape)
print(y_test.shape)

(1044, 4)
(1044,)


# Machine Learning-Based Approaches

# RandomForestClassifier

In [206]:
from sklearn.ensemble import RandomForestClassifier as RFC
classifier = RFC()
classifier.fit(X_train_Freq,y_train)

y_pred = classifier.predict(X_train_Freq)
y_pred_class=classifier.predict(X_test_Freq)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,classifier.predict(X_test_Freq)))
# f1: 2 tp / (2 tp + fp + fn)
f1 = classification_report(y_test, y_pred_class,target_names=encoder.classes_)
print('classification_report:',  f1)

Train accuracy score: 0.9963570127504554
Test accuracy score: 0.5421455938697318
classification_report:                precision    recall  f1-score   support

    negative       0.37      0.27      0.31       193
     neutral       0.57      0.67      0.62       466
    positive       0.57      0.52      0.54       385

    accuracy                           0.54      1044
   macro avg       0.50      0.49      0.49      1044
weighted avg       0.53      0.54      0.53      1044



# KNeighborsClassifier

In [209]:
from sklearn.neighbors import KNeighborsClassifier
classifer = KNeighborsClassifier(1)
classifier.fit(X_train_Freq,y_train)
y_pred = classifier.predict(X_train_Freq)
y_pred_class=classifier.predict(X_test_Freq)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,classifier.predict(X_test_Freq)))
# f1: 2 tp / (2 tp + fp + fn)
f1 = classification_report(y_test, y_pred_class,target_names=encoder.classes_)
print('classification_report: ',  f1)

Train accuracy score: 0.9965088038858531
Test accuracy score: 0.5402298850574713
classification_report:                precision    recall  f1-score   support

    negative       0.37      0.25      0.30       193
     neutral       0.56      0.67      0.61       466
    positive       0.57      0.53      0.55       385

    accuracy                           0.54      1044
   macro avg       0.50      0.48      0.49      1044
weighted avg       0.53      0.54      0.53      1044



# SVM

In [210]:
from sklearn.svm import SVC
classifer = KNeighborsClassifier(1)
classifier.fit(X_train_Freq,y_train)
y_pred = classifier.predict(X_train_Freq)
y_pred_class=classifier.predict(X_test_Freq)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,classifier.predict(X_test_Freq)))
# f1: 2 tp / (2 tp + fp + fn)
f1 = classification_report(y_test, y_pred_class,target_names=encoder.classes_)
print('classification_report: ',  f1)

Train accuracy score: 0.9965088038858531
Test accuracy score: 0.539272030651341
classification_report:                precision    recall  f1-score   support

    negative       0.36      0.26      0.30       193
     neutral       0.57      0.67      0.61       466
    positive       0.57      0.52      0.54       385

    accuracy                           0.54      1044
   macro avg       0.50      0.48      0.49      1044
weighted avg       0.53      0.54      0.53      1044

