In [None]:
import pandas as pd
import string
import unicodedata
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import sklearn.model_selection as cv
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
plt.style.use("fivethirtyeight")
%matplotlib inline

In [None]:
def readCSV(filename, encoded=False):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or buffer accepted by ``pd.read_csv``
    encoded : bool, default False
        When truthy the file is decoded as ISO-8859-1 and read with
        ``header=None``, so the first row is kept as data and the columns
        receive integer labels. When falsy, pandas defaults are used.

    Returns
    -------
    pandas.DataFrame
    """
    # Only the "encoded" flavour needs non-default reader options.
    reader_options = {"encoding": "ISO-8859-1", "header": None} if encoded else {}
    return pd.read_csv(filename, **reader_options)

In [None]:
# Emoticon vocabularies, one list per emotion category. cleanText and
# cleanTextU substitute every entry (as a plain substring) with a word token
# for its category before punctuation is stripped.
smileyfaces = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)']
sadfaces = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':<', ':-[', ':[', ':{', '=(','=[', 'D:']
angryfaces = ['>:(', '(╯°□°)╯︵ ┻━┻']
# Both apostrophe spellings are listed: the typographic U+2019 forms alone
# never match text typed with the plain ASCII apostrophe.
cryingfaces = [":’-(", ":’(", ":'-(", ":'("]
# NOTE(review): '>:' is a prefix of '>:/', '>:(' and '>:O'; it looks like a
# truncated '>:\\' -- confirm before changing it.
skepticalfaces = ['>:', '>:/', ':-/', '=/',':L', '=L', ':S', '>.<']
noexpressionfaces = [':|', ':-|', '(｀・ω・´)']
surprisedfaces = ['>:O', ':-O', ':O', ':-o', ':o', '8O', 'O_O', 'o-o', 'O_o', 'o_O', 'o_o', 'O-O']

In [None]:
def cleanText(wordSeries):
    """Normalise raw text for tokenisation (ASCII punctuation variant).

    For every element of ``wordSeries`` the following steps run in order:

    1. emoticons from the module-level category lists become word tokens
       (``' smileyface '``, ``' sadface '``, ...);
    2. ``'...'`` becomes ``' dotdotdot '`` and ``'!'`` becomes
       ``' exclamatory '``;
    3. every character in ``string.punctuation`` is replaced by a space;
    4. digits are removed;
    5. the text is lower-cased;
    6. single-character words are dropped and whitespace is collapsed.

    Parameters
    ----------
    wordSeries : pandas.Series
        Text values. Missing values (None/NaN) are treated as empty text.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as the input.
    """
    emoticon_categories = [
        (smileyfaces, ' smileyface '),
        (sadfaces, ' sadface '),
        (angryfaces, ' angryface '),
        (cryingfaces, ' cryingface '),
        (skepticalfaces, ' skepticalface '),
        (noexpressionfaces, ' noexpressionfaces '),
        (surprisedfaces, ' surprisedface '),
    ]
    replacements = [(face, token)
                    for faces, token in emoticon_categories
                    for face in faces]
    # Longest emoticon first: otherwise ':(' consumes part of '>:(' and '>:'
    # consumes part of '>:O', so the longer face is never recognised. The sort
    # is stable, so equally long emoticons keep their category order.
    replacements.sort(key=lambda pair: len(pair[0]), reverse=True)
    # These must run after the emoticons and before punctuation stripping.
    replacements += [('...', ' dotdotdot '), ('!', ' exclamatory ')]

    punctuation_to_space = str.maketrans({char: ' ' for char in string.punctuation})

    def _clean(text):
        for old, new in replacements:
            text = text.replace(old, new)
        text = text.translate(punctuation_to_space)
        text = ''.join(char for char in text if not char.isdigit())
        return ' '.join(word for word in text.lower().split() if len(word) > 1)

    # fillna/astype guard against None/NaN, which have no .replace method.
    return wordSeries.fillna('').astype(str).apply(_clean)

In [None]:
def cleanTextU(wordSeries):
    """Normalise raw text for tokenisation (Unicode punctuation variant).

    For every element of ``wordSeries`` the following steps run in order:

    1. emoticons from the module-level category lists become word tokens
       (``' smileyface '``, ``' sadface '``, ...);
    2. ``'...'`` becomes ``' dotdotdot '`` and ``'!'`` becomes
       ``' exclamatory '``;
    3. every character whose Unicode category starts with ``'P'`` is deleted
       (not replaced by a space);
    4. digits are removed;
    5. the text is lower-cased;
    6. ``'<br >'`` / ``'<br>'`` become a space and back-ticks are deleted
       ('<', '>' and '`' are not in a 'P' category, so step 3 leaves them);
    7. the words ``'id'`` and ``'im'`` and all single-character words are
       dropped, and whitespace is collapsed.

    Parameters
    ----------
    wordSeries : pandas.Series
        Text values. Missing values (None/NaN) are treated as empty text.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as the input.
    """
    # Scanning every code point is slow, so the deletion table is built on the
    # first call only and stored on the function object.
    delete_punctuation = getattr(cleanTextU, '_punct_table', None)
    if delete_punctuation is None:
        delete_punctuation = dict.fromkeys(
            code_point for code_point in range(sys.maxunicode + 1)
            if unicodedata.category(chr(code_point)).startswith('P'))
        cleanTextU._punct_table = delete_punctuation

    emoticon_categories = [
        (smileyfaces, ' smileyface '),
        (sadfaces, ' sadface '),
        (angryfaces, ' angryface '),
        (cryingfaces, ' cryingface '),
        (skepticalfaces, ' skepticalface '),
        (noexpressionfaces, ' noexpressionfaces '),
        (surprisedfaces, ' surprisedface '),
    ]
    replacements = [(face, token)
                    for faces, token in emoticon_categories
                    for face in faces]
    # Longest emoticon first: otherwise ':(' consumes part of '>:(' and '>:'
    # consumes part of '>:O', so the longer face is never recognised. The sort
    # is stable, so equally long emoticons keep their category order.
    replacements.sort(key=lambda pair: len(pair[0]), reverse=True)
    # These must run after the emoticons and before punctuation stripping.
    replacements += [('...', ' dotdotdot '), ('!', ' exclamatory ')]

    # Filtering whole words (instead of replacing ' id ' / ' im ') also catches
    # them at the start or end of the text and in back-to-back repeats.
    dropped_words = {'id', 'im'}

    def _clean(text):
        for old, new in replacements:
            text = text.replace(old, new)
        text = text.translate(delete_punctuation)
        text = ''.join(char for char in text if not char.isdigit())
        text = text.lower()
        # '<br />' has lost its '/' by now, hence the '<br >' spelling.
        for tag in ('<br >', '<br>'):
            text = text.replace(tag, ' ')
        text = text.replace('`', '')
        return ' '.join(word for word in text.split()
                        if len(word) > 1 and word not in dropped_words)

    # fillna/astype guard against None/NaN, which have no .replace method.
    return wordSeries.fillna('').astype(str).apply(_clean)

In [None]:
def tokenize(documents, unicode):
    """Clean, tokenise, stop-word filter and lemmatise a collection of texts.

    Parameters
    ----------
    documents : iterable of str passed to the cleaning function
    unicode : truthy to clean with ``cleanTextU``, falsy to use ``cleanText``

    Returns
    -------
    list of list of str
        One list of lemmatised, non-stop-word tokens per document.
    """
    cleaner = cleanTextU if unicode else cleanText
    cleaned_docs = cleaner(documents)
    tokenized_docs = [word_tokenize(text) for text in cleaned_docs]

    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    # Drop stop words first, then lemmatise whatever is left.
    return [
        [wordnet.lemmatize(token) for token in tokens if token not in english_stopwords]
        for tokens in tokenized_docs
    ]

In [None]:
def createTFIDF(data, contentCol, encoded = False):
    """Fit a TF-IDF vectoriser on one text column of ``data``.

    Side effect: ``data['Tokens']`` is created or overwritten with the
    space-joined tokens produced by ``tokenize`` for each row.

    Parameters
    ----------
    data : DataFrame-like holding the text column
    contentCol : label of the column containing the raw text
    encoded : forwarded to ``tokenize`` as its second argument

    Returns
    -------
    tuple
        ``(fitted TfidfVectorizer, document-term matrix from fit_transform)``.
    """
    data['Tokens'] = tokenize(data[contentCol], encoded)
    # The vectoriser expects one string per document, not a token list.
    data['Tokens'] = data['Tokens'].apply(' '.join)
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(data['Tokens'].tolist())
    return vectorizer, weights

In [None]:
def getLabel(data, label):
    """Return the entry of ``data`` stored under the key ``label``."""
    label_values = data[label]
    return label_values

In [None]:
def createRegressor(X,y):
    """Fit a ``LogisticRegression`` (``max_iter=1000``) on ``X`` and ``y``.

    Returns the fitted estimator.
    """
    # fit() hands back the estimator itself, so the call can be chained.
    return LogisticRegression(max_iter = 1000).fit(X, y)

In [None]:
def useTFIDF(data, contentCol, tfidf, encoded = False):
    """Vectorise one text column of ``data`` with an existing vectoriser.

    Side effect: ``data['Tokens']`` is created or overwritten with the
    space-joined tokens produced by ``tokenize`` for each row.

    Parameters
    ----------
    data : DataFrame-like holding the text column
    contentCol : label of the column containing the raw text
    tfidf : object whose ``transform`` method is applied to the corpus
    encoded : forwarded to ``tokenize`` as its second argument

    Returns
    -------
    Whatever ``tfidf.transform`` returns for the list of joined documents.
    """
    data['Tokens'] = tokenize(data[contentCol], encoded)
    # The vectoriser expects one string per document, not a token list.
    data['Tokens'] = data['Tokens'].apply(' '.join)
    joined_documents = data['Tokens'].tolist()
    return tfidf.transform(joined_documents)

In [None]:
def addPolarity(data, model, X):
    """Store ``model.predict(X)`` in ``data['polarity']`` (in place).

    Nothing is returned; ``data`` itself is modified.
    """
    predictions = model.predict(X)
    data['polarity'] = predictions

In [None]:
def splitPositiveNegative(data):
    positiveS = ['enthusiasm', 'neutral', 'surprise', 'love', 'fun', 'happiness', 'relief']
    negativeS = ['empty', 'sadness', 'neutral', 'worry', 'hate', 'boredom', 'anger']
    dataP = data[data['sentiment'].isin(positiveS)]
    dataN = data[data['sentiment'].isin(negativeS)]
    dataP['Tokens'] = tokenize(dataP['content'], False)
    dataN['Tokens'] = tokenize(dataN['content'], False)