# Classification Predict - Climate Change

### Table of contents

### Problem identification

In [None]:
# background and problem statement 

### What data do we have?

In [None]:
# description of data

### Start experiment

In [None]:
# Install necessary packages
# !pip install comet_ml

In [None]:
# from comet_ml import Experiment

In [None]:
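# Setting the API key: keep it out of the notebook and export it as the COMET_API_KEY environment variable instead.
# NOTE: a key was previously committed on this line; treat it as leaked and rotate it.
# experiment = Experiment(project_name="classification-predict", workspace="team-rm1")  # api_key is picked up from COMET_API_KEY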

### Import libraries

In [1]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from nltk import SnowballStemmer
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

### Import data

In [2]:
# Read the test and training sets (in that order) straight from the team's
# GitHub repository.
df_test, df_train = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Amogelang20/RM1_classification_predict/dev/test.csv',
        'https://raw.githubusercontent.com/Amogelang20/RM1_classification_predict/dev/train.csv',
    )
)

In [3]:
# Use the tweet id as the row index, then preview the first rows.
df_train = df_train.set_index('tweetid')
df_train.head()

Unnamed: 0_level_0,sentiment,message
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,1,PolySciMajor EPA chief doesn't think carbon di...
126103,1,It's not like we lack evidence of anthropogeni...
698562,2,RT @RawStory: Researchers say we have three ye...
573736,1,#TodayinMaker# WIRED : 2016 was a pivotal year...
466954,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ..."


In [4]:
# Use the tweet id as the row index, then preview the first rows.
df_test = df_test.set_index('tweetid')
df_test.head()

Unnamed: 0_level_0,message
tweetid,Unnamed: 1_level_1
169760,Europe will now be looking to China to make su...
35326,Combine this with the polling of staffers re c...
224985,"The scary, unimpeachable evidence that climate..."
476263,@Karoli @morgfair @OsborneInk @dailykos \r\nPu...
872928,RT @FakeWillMoore: 'Female orgasms cause globa...


### Exploratory data analysis

In [None]:
# look at the type of data that is present. 
# look at the types and number of columns present. 
# look at the y variable, the number of classes it has. 
# check for imbalance of data in the different classes of y variable. 
# check for missing values.
# visualise the data.


### Preprocessing

In [None]:
#### CLEANING ####
# handle some of the unnecessary punctuation 
# upper/lower case
# change the slang words into something more meaningful to machine learning
# handle missing data and empty strings
# etc.


In [13]:
def data_cleaning(df):
    """Return a cleaned copy of a tweet DataFrame.

    The frame must have a 'message' column holding the tweet text; any other
    columns (e.g. 'sentiment') are carried through untouched, so the function
    works for both the training and the test set.

    Steps, in order:
      1. drop rows containing missing values;
      2. drop tweets that are empty or whitespace-only;
      3. lower-case the text;
      4. replace the HTML entity '&amp;' with 'and';
      5. remove URLs (http://, https:// and www. links);
      6. replace the standalone token 'rt' with 'retweet';
      7. remove punctuation (anything that is not a word character or whitespace);
      8. drop tweets whose cleaned text duplicates an earlier one (first kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tweets. It is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with the cleaned text stored in 'message'.
    """
    # dropna() returns a new frame, so the caller's DataFrame is never mutated
    # and the calling cell can safely be re-run.
    cleaned = df.dropna()

    # Drop empty / whitespace-only tweets. Done on the column itself rather
    # than by unpacking itertuples(), so it does not depend on column count.
    has_text = cleaned['message'].astype(str).str.strip() != ''
    cleaned = cleaned.loc[has_text]

    # regex=True is passed explicitly everywhere: the default of
    # Series.str.replace differs between pandas versions, and a literal
    # replacement would silently leave punctuation in place.
    message = (
        cleaned['message']
        .astype(str)
        # Capitalised words would otherwise count as different tokens.
        .str.lower()
        .str.replace(r'&amp;', 'and', regex=True)          # replace & with and
        .str.replace(r'http\S+|www\S+', '', regex=True)    # removing urls
        # \b keeps the replacement to the standalone token, so words that merely
        # contain 'rt' ("part", "support") are left alone.
        .str.replace(r'\brt\b', 'retweet', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)           # removing punctuation
    )
    cleaned = cleaned.assign(message=message)

    # Remove duplicate tweets from the frame being cleaned (not a global one).
    cleaned = cleaned.drop_duplicates(subset=['message'], keep='first')

    return cleaned

In [None]:
# Clean the training tweets. The frame loaded above is named `df_train`
# (`train_df` is never defined in this notebook).
df_train = data_cleaning(df_train)

### Feature creation

In [None]:
# Create some features from the given tweets, e.g. the length of each tweet. Visualise the created features.


### Split data into response and predictors

In [None]:
#### 'message' holds the cleaned text: data_cleaning() writes its result back into that column ####
y = df_train['sentiment']   # response: sentiment class of each tweet
X = df_train['message']     # predictor: cleaned tweet text


### Split data into training and validation sets

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Build pipelines to vectorize the data

In [None]:
# Stem, tokenize and remove stopwords (all done within vectorization)
# Build a pipeline that vectorizes the text and creates classifiers for the different models 
# (logistic reg, SVM, Naive Bayes, Random Forest, Neural Nets)


In [None]:
# Callable that tokenizes text data AND stems the resulting tokens
class StemAndTokenize:
    """Tokenizer callable: splits a document into tokens and stems each one."""

    def __init__(self):
        # English Snowball stemmer, created once and reused for every document
        self.ss = SnowballStemmer('english')

    def __call__(self, doc):
        """Return the list of stemmed tokens of `doc`."""
        tokens = word_tokenize(doc)
        return list(map(self.ss.stem, tokens))


#### Logistic Regression

In [None]:
# Logistic Regression pipeline: TF-IDF over stemmed tokens, then the classifier
lr = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
        ('lr', LogisticRegression()),
    ]
)


#### Naïve Bayes

In [None]:
# Naïve Bayes pipeline: TF-IDF over stemmed tokens, then the classifier
nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
        ('nb', MultinomialNB()),
    ]
)


#### Linear SVM (Support Vector Machine)

In [None]:
# Create pipeline for SVM: TF-IDF over stemmed tokens, then a linear SVM.
# random_state is fixed (same seed as the train/validation split) so repeated
# runs of the notebook give the same model.
Lsvm = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
                 ('Lsvm', LinearSVC(random_state=42))
                ])


#### Random Forest

In [None]:
# Create pipeline for Random Forest: TF-IDF over stemmed tokens, then the forest.
# random_state is fixed (same seed as the train/validation split) because the
# forest is built from random samples; without it every run scores differently.
rf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('rf', RandomForestClassifier(random_state=42))
              ])


#### KNN (K Nearest Neighbors)

In [None]:
# KNN pipeline: TF-IDF over stemmed tokens, then the classifier
knn = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
        ('knn', KNeighborsClassifier()),
    ]
)


#### Neural Networks

In [None]:
# Create pipeline for Neural Networks: TF-IDF over stemmed tokens, then an MLP.
# random_state is fixed (same seed as the train/validation split) because the
# network weights are initialised randomly; without it every run scores differently.
nn = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('nn', MLPClassifier(random_state=42))
              ])


### Hyperparameter optimization

In [None]:
# optimize models by tuning parameters (GridSearch)


### Train models

#### Logistic Regression

In [None]:
# Train the Logistic Regression pipeline on the training split
lr.fit(X=X_train, y=y_train)


#### Naïve Bayes

In [None]:
# Train the Naïve Bayes pipeline on the training split
nb.fit(X=X_train, y=y_train)


#### Linear SVM (Support Vector Machine)

In [None]:
# Train the linear SVM pipeline on the training split
Lsvm.fit(X=X_train, y=y_train)


#### Random Forest

In [None]:
# Train the Random Forest pipeline on the training split
rf.fit(X=X_train, y=y_train)


#### KNN (K Nearest Neighbors)

In [None]:
# Train the KNN pipeline on the training split
knn.fit(X=X_train, y=y_train)


#### Neural Networks

In [None]:
# Train the Neural Network pipeline on the training split
nn.fit(X=X_train, y=y_train)


### Make predictions

In [None]:
# make predictions


### Evaluate model accuracy

In [None]:
# confusion matrix
# classification report


#### Confusion Matrices

...

#### Logistic Regression

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_lr` is assigned in the visible notebook.
pd.DataFrame(data=confusion_matrix(y_test, lr.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### Naïve Bayes

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_nb` is assigned in the visible notebook.
pd.DataFrame(data=confusion_matrix(y_test, nb.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### Linear SVM (Support Vector Machine)

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_lsvm` is assigned in the visible
# notebook; the fitted pipeline is named `Lsvm`.
pd.DataFrame(data=confusion_matrix(y_test, Lsvm.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### Random Forest

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_rf` is assigned in the visible notebook.
pd.DataFrame(data=confusion_matrix(y_test, rf.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### KNN (K Nearest Neighbors)

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_knn` is assigned in the visible notebook.
pd.DataFrame(data=confusion_matrix(y_test, knn.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### Neural Networks

In [None]:
# Class ids in display order, paired one-to-one with their readable names.
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
class_ids = [2, 1, 0, -1]
labels = ['2: News', '1: Pro', '0: Neutral', '-1: Anti']

# `labels=class_ids` makes the matrix rows/columns follow the display order;
# without it the classes are sorted ascending (-1, 0, 1, 2) and the names above
# would be attached to the wrong rows and columns.
# Predictions are computed here because no `pred_nn` is assigned in the visible notebook.
pd.DataFrame(data=confusion_matrix(y_test, nn.predict(X_test), labels=class_ids),
             index=labels, columns=labels)

#### Classification Report

...

#### Logistic Regression

In [None]:
# Report for the Logistic Regression pipeline (`pred_lm` is never defined, so
# the predictions are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from Logistic Regression Model')
print(classification_report(y_test, lr.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

#### Naïve Bayes

In [None]:
# Report for the Naïve Bayes pipeline (`pred_lm` is never defined, so the
# predictions are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from Naïve Bayes Model')
print(classification_report(y_test, nb.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

#### Linear SVM (Support Vector Machine)

In [None]:
# Report for the linear SVM pipeline, named `Lsvm` (`pred_lm` is never defined,
# so the predictions are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from Linear SVM (Support Vector Machine) Model')
print(classification_report(y_test, Lsvm.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

#### Random Forest

In [None]:
# Report for the Random Forest pipeline (`pred_lm` is never defined, so the
# predictions are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from Random Forest Model')
print(classification_report(y_test, rf.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

#### KNN (K Nearest Neighbors)

In [None]:
# Report for the KNN pipeline (`pred_lm` is never defined, so the predictions
# are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from KNN(K Nearest Neighbours) Model')
print(classification_report(y_test, knn.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

#### Neural Networks

In [None]:
# Report for the Neural Network pipeline (`pred_lm` is never defined, so the
# predictions are computed here from the model this section is about).
# `labels` lists the class ids in the same order as `target_names`; without it
# the names would be matched against the classes sorted ascending (-1, 0, 1, 2).
# NOTE(review): assumes `sentiment` holds the integers 2, 1, 0, -1 — confirm.
print('Classification Report from Neural Networks Model')
print(classification_report(y_test, nn.predict(X_test),
                            labels=[2, 1, 0, -1],
                            target_names=['2: News', '1: Pro', '0: Neutral', '-1: Anti']))

In [None]:
#Table for all models with only f1 score

In [None]:
# Summary table: one F1 score per model, computed on the validation split.
# TODO: reuse stored predictions instead of calling predict() again once the
# "Make predictions" section defines them.
#
# `average` must be given explicitly: the target has four classes, and
# f1_score's default (average='binary') only supports binary targets.
# NOTE(review): 'macro' weights every class equally; use 'weighted' instead if
# class frequency should count — confirm against the project's metric.
labels = ['Model', 'F1_score']   # the two columns of the table, in display order
data = {'Model': ['Logistic Regression', 'Naïve Bayes', 'Linear SVM', 'Random Forest', 'KNN', 'Neural Network'],
        'F1_score': [f1_score(y_test, lr.predict(X_test), average='macro'),
                     f1_score(y_test, nb.predict(X_test), average='macro'),
                     f1_score(y_test, Lsvm.predict(X_test), average='macro'),   # pipeline is named `Lsvm`
                     f1_score(y_test, rf.predict(X_test), average='macro'),
                     f1_score(y_test, knn.predict(X_test), average='macro'),
                     f1_score(y_test, nn.predict(X_test), average='macro')]}

pd.DataFrame(data=data, columns=labels)

### Save the model

In [None]:
# pickle model


### Log parameters

In [None]:
# save parameters in variables to be logged to comet


### End experiment

### Conclusion