# IMPORTS

In [1]:
import csv
import preprocessor as pr
import pandas as pd
from langdetect import detect
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eugenia_bogacheva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eugenia_bogacheva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/eugenia_bogacheva/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eugenia_bogacheva/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# PREPROCESSING

## Reading the data

In [2]:
test_topics = []
test_sentiment = []
test_text = []
with open('test.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        test_topics.append(row["Topic"])
        test_sentiment.append(row["Sentiment"])
        test_text.append(row["TweetText"])

In [3]:
train_topics = []
train_sentiment = []
train_text = []
with open('train.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        train_topics.append(row["Topic"])
        train_sentiment.append(row["Sentiment"])
        train_text.append(row["TweetText"])

## Examples of tweet text

In [4]:
for i in [0,111,333,555,888]:
    print(train_text[i])

Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
RT @katebetts: Another great James Stewart story in today's NY Times about importance of architecture in @apple retail success http://t. ...
Pissed with whoever designs keyboards with @apple for not having a home and end key.  working on the CLI i use those keys often
In front of the @apple store. So many blue shirts I feel like I'm at a smurf reunion. 
iPadのビジネス活用セミナー@Appleストア銀座なう。イシン（株）の高木さんを見にきたよ。


## Dropping irrelevant

In [33]:
train_sentiment_without_irrelevant = []
train_topics_without_irrelevant = []
train_text_without_irrelevant = []

for i in range(len(train_sentiment)):
    if train_sentiment[i] != 'irrelevant':
        train_sentiment_without_irrelevant.append(train_sentiment[i])
        train_topics_without_irrelevant.append(train_topics[i])
        train_text_without_irrelevant.append(train_text[i])

In [34]:
test_sentiment_without_irrelevant = []
test_topics_without_irrelevant = []
test_text_without_irrelevant = []

for i in range(len(test_sentiment)):
    if test_sentiment[i] != 'irrelevant':
        test_sentiment_without_irrelevant.append(test_sentiment[i])
        test_topics_without_irrelevant.append(test_topics[i])
        test_text_without_irrelevant.append(test_text[i])

## Data cleaning

In [7]:
def preprocess(texts):
    documents = []
    pr.set_options(pr.OPT.URL, pr.OPT.HASHTAG, pr.OPT.MENTION)
    stemmer = WordNetLemmatizer()

    for sen in range(0, len(texts)):
        # remove URLs, hashtags and mentions
        document = pr.clean(texts[sen])

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [stemmer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
        
    return documents

In [35]:
train_text_cleaned = preprocess(train_text)
test_text_cleaned = preprocess(test_text)

In [36]:
train_text_without_irrelevant_cleaned = preprocess(train_text_without_irrelevant)
test_text_without_irrelevant_cleaned = preprocess(test_text_without_irrelevant)

# AUXILARY FUNCTIONS

## Function for calculating the confusion matrix

In [60]:
def print_confusion_matrix(predictions, true_labels):
    print(confusion_matrix(predictions, true_labels, labels = ['positive', 'negative', 'neutral', 'irrelevant']))

In [11]:
def float_to_percents(number):
    return round(number * 100, 2)

In [12]:
def metrics(predictions, true_labels):
    accuracy = float_to_percents(accuracy_score(predictions, true_labels))
    f1_score_weighted = float_to_percents(f1_score(predictions, true_labels, average = 'weighted'))
    f1_score_micro = float_to_percents(f1_score(predictions, true_labels, average = 'micro'))
    f1_score_macro = float_to_percents(f1_score(predictions, true_labels, average = 'macro'))
    return [accuracy, f1_score_weighted, f1_score_micro, f1_score_macro]

In [13]:
def create_metrics_df(metrics_dict):
    df = pd.DataFrame.from_dict(metrics_dict, orient='index')
    df = df.rename(columns={0:'Accuracy', 1: 'Weighted F1 score', 2: 'Micro F1 score', 3: 'Macro F1 score'})
    return df.sort_values(by=['Accuracy'], ascending=False)

# MODELS

In [14]:
best_metrics = {}

In [15]:
vectorizers = CountVectorizer, TfidfVectorizer
vectorizers_names = ['CountVectorizer', 'TfidfVectorizer']

In [16]:
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial'), LinearSVC(), RandomForestClassifier()]
classifiers_names = ['LogisticRegression', 'LinearSVC', 'Random Forest']

## Ngrams

In [17]:
ngrams_models_metrics = {}
ngram_ranges = [(1, 1), (1, 2), (2, 2)]
max_features = [4000, 5000, 6000, 10000, 15000]

for ngram_range in ngram_ranges:
    for vct_name, vectorizer in zip(vectorizers_names, vectorizers):
        for clf_name, classifier in zip(classifiers_names, classifiers):
            for mf in max_features:
                clf = Pipeline([
                    ('vectorizer', vectorizer(analyzer="word",
                                                   ngram_range=ngram_range,
                                                   tokenizer=word_tokenize,
                                                   max_features=mf  ) ),
                    ('classifier', classifier)
                ])


                clf.fit(train_text_cleaned, train_sentiment)
                pred =clf.predict(test_text_cleaned)

                name = 'NGR: ' + str(ngram_range) + '; V: ' + vct_name + '; CLF: ' + clf_name + '; MF: ' + str(mf) 
                ngrams_models_metrics[name] = metrics(pred, test_sentiment)



### The best configurations:

In [18]:
df_ngrams_metrics = pd.DataFrame.from_dict(ngrams_models_metrics, orient='index')
df_ngrams_metrics = df_ngrams_metrics.rename(columns={0:'Accuracy', 1: 'Weighted F1 score', 2: 'Micro F1 score', 3: 'Macro F1 score'})
df_ngrams_metrics.sort_values(by=['Accuracy'], ascending=False).head()

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
"NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 4000",78.95,80.03,78.95,69.52
"NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 5000",78.95,79.98,78.95,69.85
"NGR: (1, 1); V: TfidfVectorizer; CLF: LinearSVC; MF: 5000",78.65,79.33,78.65,71.19
"NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 6000",78.36,79.37,78.36,69.35
"NGR: (1, 1); V: TfidfVectorizer; CLF: LinearSVC; MF: 6000",78.07,78.8,78.07,70.6


### And the worst ones:

In [19]:
df_ngrams_metrics.sort_values(by=['Accuracy'], ascending=False).tail()

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
"NGR: (2, 2); V: CountVectorizer; CLF: Random Forest; MF: 15000",59.36,63.47,59.36,47.66
"NGR: (2, 2); V: CountVectorizer; CLF: Random Forest; MF: 10000",59.36,62.52,59.36,48.75
"NGR: (2, 2); V: TfidfVectorizer; CLF: Random Forest; MF: 15000",59.36,62.23,59.36,52.02
"NGR: (2, 2); V: TfidfVectorizer; CLF: Random Forest; MF: 6000",57.6,60.55,57.6,48.09
"NGR: (2, 2); V: TfidfVectorizer; CLF: Random Forest; MF: 10000",56.43,59.21,56.43,46.96


In [20]:
best_metrics['Ngrams'] = ngrams_models_metrics['NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 4000']

## If we could drop the irrelevant ones from both train and test set...

In [37]:
ngrams_models_metrics_without_irrelevant = {}
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer="word", ngram_range=(1,1), tokenizer=word_tokenize,
                                                   max_features= 5000  ) ),
                    ('classifier', LinearSVC())])

clf.fit(train_text_without_irrelevant_cleaned, train_sentiment_without_irrelevant)
pred =clf.predict(test_text_without_irrelevant_cleaned)
ngrams_models_metrics_without_irrelevant['NGR: (1, 1); V: TfidfVectorizer; CLF: LinearSVC; MF: 5000'] = metrics(pred, test_sentiment_without_irrelevant)

In [38]:
create_metrics_df(ngrams_models_metrics_without_irrelevant)

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
"NGR: (1, 1); V: TfidfVectorizer; CLF: LinearSVC; MF: 5000",79.75,80.85,79.75,70.52


## ... it would be skightly better, but we cannot.

## Maybe predicting 'irrelevant' for every not English tweet will help? 

In [39]:
pred = clf.predict(test_text_cleaned)

for i in range(len(test_text)):
    language = 'en'
    try:
        language = detect(test_text[i])
    except Exception:
        pass
    if language != 'en':
        pred[i] = 'irrelevant'

In [42]:
test_sentiment_cropped = []
for i in range(len(test_sentiment)):
    if test_sentiment[i] == 'irrelevant':
        test_sentiment_cropped.append('irreleva')
    else:
        test_sentiment_cropped.append(test_sentiment[i])

In [43]:
d = {'Detecting language': metrics(pred, test_sentiment_cropped)}
create_metrics_df(d)

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
Detecting language,76.02,76.8,76.02,69.52


## No, not really.
### The problem is that the language detection performs poorly.

# What else was done?

In [44]:
def sinononyms_antonyms(word):
    synonyms = []
    antonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    return synonyms, antonyms

In [45]:
positive = []
negative = []

positive_seed = ['good', 'awesome', 'useful', 'great', 'love', 'favourite']
for word in positive_seed:
    pos, neg = sinononyms_antonyms(word)
    positive.extend(pos)
    negative.extend(neg)

negative_seed = ['bad', 'hate', 'useless', 'awful', 'dislike', 'shit', 'terrible', 'suck']
for word in negative_seed:
    neg, pos = sinononyms_antonyms(word)
    positive.extend(pos)
    negative.extend(neg)

In [46]:
positive_in_train = []
negative_in_train = []
for i in range(len(train_text)):
    tweet = train_text[i].lower()
    for pos in positive:
        if (pos in tweet) and (train_sentiment[i] == 'positive'):
            positive_in_train.append(pos)
    for neg in negative:
        if (neg in tweet) and (train_sentiment[i] == 'negative'):
            negative_in_train.append(neg)
positive_in_train = list(dict.fromkeys(positive_in_train))
negative_in_train = list(dict.fromkeys(negative_in_train))

### Positive adjectives

In [47]:
print(positive_in_train)

['just', 'love', 'great', 'bed', 'useful', 'good', 'amazing', 'eff', 'right', 'well', 'like', 'know', 'nice', 'awesome', 'keen', 'sound', 'dear', 'big', 'bully', 'neat', 'near', 'full', 'serious', 'enjoy', 'secure', 'fuck', 'favorite']


### Negative adjectives

In [48]:
print(negative_in_train)

['ill', 'evil', 'make', 'shit', 'suck', 'jack', 'hoot', 'awful', 'painful', 'crap', 'bad', 'hate', 'sorry', 'rat', 'awesome', 'damn', 'terrible', 'useless', 'sucking', 'dire', 'bastard', 'bullshit', 'bull', 'dump', 'stag', 'big']


In [49]:
pred = []
for tweet in test_text_without_irrelevant:
    n = 0
    p = 0
    sentiment = 'neutral'
    for pos in positive_in_train:
        if pos in tweet:
            p +=1 
    for neg in negative_in_train:
        if neg in tweet:
            n +=1
    if (p>0) or (n>0):
        if p>n:
            sentiment = 'positive'
        else: 
            sentiment = 'negative'
    pred.append(sentiment)

In [50]:
d = {'Playing with adjectives': metrics(pred, test_sentiment_without_irrelevant)}
create_metrics_df(d)

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
Playing with adjectives,57.81,59.31,57.81,36.92


### The results are by no means satisfying, but maybe this approach can be used in a more sofisticated manner, maybe together with some other baseline model.

So the best model for sentiment calssification is **NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 4000**.
It achieved the  following results:
* **Accuracy**: 78.95
* **Weightes F1 score**: 80.03
* **Micro F1 score**: 78.95 
* **Macro F1 score**: 69.592

# PRODUCT CLASSIFICATION

Preprocessing is slightly different since we don't want to remove mentions and hashtexts which often contain product names.

In [51]:
def preprocess_product(texts):
    documents = []
    pr.set_options(pr.OPT.URL)
    stemmer = WordNetLemmatizer()

    for sen in range(0, len(texts)):
        # remove URLs, hashtags and mentions
        document = pr.clean(texts[sen])

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [stemmer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
        
    return documents

In [52]:
train_text_product = preprocess_product(train_text)
test_text_product = preprocess_product(test_text)

In [53]:
product_metrics = {}
ngram_ranges = [(1, 1), (1, 2), (2, 2)]
max_features = [4000, 5000, 6000, 10000, 15000]

for ngram_range in ngram_ranges:
    for vct_name, vectorizer in zip(vectorizers_names, vectorizers):
        for clf_name, classifier in zip(classifiers_names, classifiers):
            for mf in max_features:
                clf = Pipeline([
                    ('vectorizer', vectorizer(analyzer="word",
                                                   ngram_range=ngram_range,
                                                   tokenizer=word_tokenize,
                                                   max_features=mf  ) ),
                    ('classifier', classifier)
                ])


                clf.fit(train_text_product, train_topics)
                pred =clf.predict(test_text_product)

                name = 'NGR: ' + str(ngram_range) + '; V: ' + vct_name + '; CLF: ' + clf_name + '; MF: ' + str(mf) 
                product_metrics[name] = metrics(pred, test_topics)



In [55]:
create_metrics_df(product_metrics).head()

Unnamed: 0,Accuracy,Weighted F1 score,Micro F1 score,Macro F1 score
"NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 15000",86.26,86.26,86.26,85.77
"NGR: (1, 1); V: CountVectorizer; CLF: LinearSVC; MF: 10000",86.26,86.36,86.26,85.63
"NGR: (1, 1); V: CountVectorizer; CLF: LinearSVC; MF: 15000",86.26,86.36,86.26,85.63
"NGR: (1, 2); V: TfidfVectorizer; CLF: LinearSVC; MF: 10000",86.26,86.29,86.26,85.74
"NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 10000",85.96,85.92,85.96,85.5


So the best model for product calssification is **NGR: (1, 2); V: CountVectorizer; CLF: LogisticRegression; MF: 15000**.
It achieved the following results:
* **Accuracy**: 86.26
* **Weightes F1 score**: 86.26
* **Micro F1 score**: 86.26
* **Macro F1 score**: 85.63