In [None]:
# This may require a runtime restart in order to work
!pip install 'pandas==1.3.0'

In [None]:
!pip install config

In [None]:
!pip install 'tweepy==4.4.0'

In [None]:
!pip install pyspellchecker

In [None]:
# Standard Packages
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import re
from wordcloud import WordCloud

# NLTK packages
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords

# Neural Network packages
#import keras
import tensorflow as tf
from sklearn import preprocessing, model_selection
#from keras.models import Sequential, load_model 
#from keras.layers import Dense, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint

# Sklearn packages
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Loading the dataset for mac users
# NOTE(review): the path below looks like a Colab Google Drive mount rather than
# a macOS path — confirm which environment this cell is meant for.
FILE_PATH = '/content/drive/MyDrive/ML/GBC/DL1/Project/'
RANDOM_SEED = 42

# Read the CSV as latin-1 text; rows pandas cannot parse are skipped
data = pd.read_csv(
    FILE_PATH + 'sentiment_analysis_dataset.csv',
    encoding='latin-1',
    on_bad_lines='skip',
    sep=',',
)

# Show the (rows, columns) shape, then preview the first five rows
print(data.shape)
data.head(5)

In [None]:
# Loading the dataset for PC users (the CSV is read from the working directory)
data = pd.read_csv(
    'sentiment analysis dataset.csv',
    encoding='latin-1',
    on_bad_lines='skip',
    sep=',',
)

# Show the (rows, columns) shape, then preview the first five rows
print(data.shape)
data.head(5)

(1578612, 4)


Unnamed: 0,ï»¿ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [None]:
# Count the missing values in each column
data.isnull().sum()

ï»¿ItemID          0
Sentiment          0
SentimentSource    0
SentimentText      0
dtype: int64

In [None]:
# The Item ID column is not useful for us, drop it
# (its header appears as 'ï»¿ItemID' in the loaded frame).
# Memory notes: 1/3 of the dataset ran out of memory and 1/4 seemed to be our
# limit; the fraction actually sampled below is 1/15.
# random_state makes the sample reproducible across kernel restarts. 42 is the
# value RANDOM_SEED holds in the Google Drive loading cell; it is written out
# here because the PC loading cell does not define RANDOM_SEED.
data = data.drop(['ï»¿ItemID'], axis=1)
data = data.sample(frac=1/15, random_state=42).reset_index(drop=True)
data.shape

(105241, 3)

In [None]:
# Preview the first five rows of the sampled frame
data.head(5)

Unnamed: 0,Sentiment,SentimentSource,SentimentText
0,0,Sentiment140,@joelMadden Im up too. I can't sleeeeeeeep! An...
1,0,Sentiment140,work at 8am..no fun
2,1,Sentiment140,@PursuitBrooke Are you more of a Summer person...
3,0,Sentiment140,wants to smoke a cig but doesn't have a lighter.
4,0,Sentiment140,what pass?? there are lots of twits but we don...


## Text Cleaner \# 1

This cleaner uses spell checking and lemmatizing. Both are very slow, so it cannot be used to process the whole dataset quickly.

In [None]:
# Clean text
import re
from string import punctuation
from collections import Counter

from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

def clean_text_for_tfidf_vectorizer(text):
    """Spell-check, filter and lemmatize sentences ahead of TF-IDF vectorizing.

    Each sentence is split with ``TweetTokenizer``. Every token is lower-cased,
    spell-corrected, dropped if it is a stop word, stripped of every character
    that is not an ASCII letter, and finally lemmatized.

    Parameters
    ----------
    text : iterable of str
        The raw sentences (tweets) to clean.

    Returns
    -------
    cleaned_text : list of str
        One string per input sentence: its surviving tokens joined by spaces.
    punctuation_counts : list of int
        Number of ``string.punctuation`` characters in each raw sentence.
    """
    spell = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    tk = TweetTokenizer()

    # A set gives constant-time membership tests; the stop words are looked up
    # once per token, so a list would be rescanned on every lookup.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(['i', 'im', 'http', 'ive', 'rt'])

    # Built once instead of once per sentence.
    punctuation_chars = set(punctuation)

    cleaned_text = []
    punctuation_counts = []

    for sentence in text:
        # Punctuation is counted on the raw sentence, before any cleaning.
        punctuation_counts.append(
            sum(1 for char in sentence if char in punctuation_chars)
        )

        cleaned_words = []
        for token in tk.tokenize(sentence):
            word = token.lower()

            # Spell check. Newer pyspellchecker releases return None when no
            # correction candidate exists; keep the original token in that
            # case instead of passing None on to re.sub (TypeError).
            corrected = spell.correction(word)
            if corrected is not None:
                word = corrected

            # Remove stop words
            if word in stopword_set:
                continue

            # Remove numbers and punctuation
            word = re.sub('[^a-zA-Z]+', '', word)
            if not word:
                continue

            # Lemmatize
            cleaned_words.append(lemmatizer.lemmatize(word))

        cleaned_text.append(' '.join(cleaned_words))

    return cleaned_text, punctuation_counts

# Checking the run time of the script on a small subset of the dataset
text = data['SentimentText']
# .loc slices by label and includes both end points, so this keeps the rows
# labelled 1 through 500.
# NOTE(review): the row labelled 0 is skipped — confirm that is intended.
text = text.loc[1:500]

# `%time` is an IPython line magic: it prints the time taken by the expression
# and, when used on the right-hand side of an assignment, passes its value on.
print('cleaned text started')
cleaned_text, punctuation_counts = %time clean_text_for_tfidf_vectorizer(text)
print('cleaned text completed')
 
# ngram_range=(1,3): the vectorizer uses 1-, 2- and 3-grams as features
tfidf_vectorizer = %time TfidfVectorizer(ngram_range=(1,3))

print('tfidf vectorizer started')
tfidf_vect_w_mat = %time tfidf_vectorizer.fit_transform(cleaned_text)
print('tfidf vectorizer completed')

# Convert the TF-IDF matrix to a dense one and wrap it in a DataFrame;
# the columns are labelled with the vectorizer's feature names further down.
tfidf_feature_names = %time tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame (tfidf_vect_w_mat.todense())

print('column names started')
%time tfidf_df.columns = tfidf_feature_names # This may be a very expensive operation, consider commenting out
print('column names completed')

# Add the per-tweet punctuation count as one more feature column
tfidf_df['punctuation_count'] = punctuation_counts

cleaned text started
Wall time: 1min 53s
cleaned text completed
Wall time: 0 ns
tfidf vectorizer started
Wall time: 14 ms
tfidf vectorizer completed
Wall time: 1.02 ms
column names started
Wall time: 0 ns
column names completed


We tried this function to clean the dataset and benchmarked its performance. Cleaning only 500 tweets took almost 2 minutes, so we looked for ways to reduce the run time.

## Text Cleaner \# 2

After testing, we discovered that the truly computationally expensive function was the spell checker. We will create 2 versions of the dataset, one with the spell checker and one without, so we can move forward with model building
while the spell-checked version is still being processed.

In [None]:
import re
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords

from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Start from NLTK's English stop words and add a few extra tokens
new_stop_words = ['i', 'im', 'http', 'ive', 'rt']
stopwordlist = stopwords.words('english')
stopwordlist.extend(new_stop_words)


# Columns fed to the preprocessing step: the tweet text and its sentiment value
sentiment = data['Sentiment']
text = data['SentimentText']


def preprocess(textdata):
    """Clean tweets with regex normalisation, spell checking and stemming.

    For every tweet: @mentions are removed, punctuation characters are counted,
    the text is lower-cased, every run of non-letters becomes a single space,
    and any character repeated three or more times is reduced to two. The
    result is tokenized; tokens found in the module-level ``stopwordlist`` are
    dropped and the remaining ones are spell-corrected and Porter-stemmed.

    Parameters
    ----------
    textdata : iterable of str
        The raw tweets.

    Returns
    -------
    processedText : list of str
        One string per tweet; every kept word is followed by a single space
        (so non-empty results end with a trailing space).
    punctuation_counts : list of int
        Number of ``string.punctuation`` characters in each tweet, counted
        after @mentions were removed and before any other cleaning.
    """
    processedText = []
    punctuation_counts = []

    # Create the stemmer, spell checker and tokenizer once per call.
    wordstem = PorterStemmer()
    spell = SpellChecker()
    tk = TweetTokenizer()

    # Built once instead of once per tweet; the set gives constant-time
    # stop-word lookups instead of rescanning the list for every token.
    punctuation_chars = set(punctuation)
    stopword_set = set(stopwordlist)

    for tweet in textdata:
        # Strip @mentions
        tweet = re.sub(r'@([A-Za-z0-9_]+)', '', tweet)
        punctuation_counts.append(
            sum(1 for char in tweet if char in punctuation_chars)
        )
        tweet = tweet.lower()

        # Replace every run of non-letters with one space
        tweet = re.sub('[^a-zA-Z]+', ' ', tweet)
        # Reduce 3+ repeats of the same character to exactly 2
        tweet = re.sub(r"(.)\1\1+", r"\1\1", tweet)

        kept_words = []
        for word in tk.tokenize(tweet):
            # Skip stop words
            if word in stopword_set:
                continue

            # Spell check the word. Newer pyspellchecker releases return None
            # when no correction candidate exists; keep the original word in
            # that case instead of calling the stemmer on None.
            corrected = spell.correction(word)
            if corrected is not None:
                word = corrected

            # Stem the word
            kept_words.append(wordstem.stem(word))

        # Same layout as before: each word followed by one space.
        processedText.append(''.join(word + ' ' for word in kept_words))

    return processedText, punctuation_counts

print('Starting Function')
processedText, punctuation_counts = %time preprocess(text)
print('Ending Function')

cleaned_text = pd.DataFrame(data=processedText, columns = ['tweet'])
cleaned_text['punctuation_count'] = punctuation_counts
cleaned_text['sentiment'] = sentiment

#Created a .csv for review
cleaned_text.to_csv('cleaned_text.csv')



Starting Function
Wall time: 38min 35s
Ending Function
