# Classification Predict - Climate Change

### Table of contents

### Problem identification

In [None]:
# background and problem statement 

### What data do we have?

In [None]:
# description of data

### Start experiment

In [None]:
# Install necessary packages
# !pip install comet_ml

In [None]:
# from comet_ml import Experiment

In [None]:
# Setting the API key (saved as environment variable)
# experiment = Experiment(api_key="upOwchWrd7H1e6VEnWKW7PSvz", project_name="classification-predict", workspace="team-rm1")

### Import libraries

In [2]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from nltk import SnowballStemmer
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

### Import data

In [8]:
df_test = pd.read_csv('https://raw.githubusercontent.com/Amogelang20/RM1_classification_predict/dev/test.csv')
df_train = pd.read_csv('https://raw.githubusercontent.com/Amogelang20/RM1_classification_predict/dev/train.csv')

In [9]:
df_train.set_index('tweetid',inplace = True)
df_train.head()

Unnamed: 0_level_0,sentiment,message
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,1,PolySciMajor EPA chief doesn't think carbon di...
126103,1,It's not like we lack evidence of anthropogeni...
698562,2,RT @RawStory: Researchers say we have three ye...
573736,1,#TodayinMaker# WIRED : 2016 was a pivotal year...
466954,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ..."


In [10]:
df_test.set_index('tweetid',inplace = True)
df_test.head()

Unnamed: 0_level_0,message
tweetid,Unnamed: 1_level_1
169760,Europe will now be looking to China to make su...
35326,Combine this with the polling of staffers re c...
224985,"The scary, unimpeachable evidence that climate..."
476263,@Karoli @morgfair @OsborneInk @dailykos \r\nPu...
872928,RT @FakeWillMoore: 'Female orgasms cause globa...


### Exploratory data analysis

In [None]:
# look at the type of data that is present. 
# look at the types and number of columns present. 
# look at the y variable, the number of classes it has. 
# check for imbalance of data in the different classes of y variable. 
# check for missing values.
# visualise the data.


### Preprocessing

In [None]:
#### CLEANING ####
# handle some of the unnecessary punctuation 
# upper/lower case
# change the slang words into something more meaningful to machine learning
# handle missing data and empty strings
# ect.


In [11]:
#drop missing texts
df_train.dropna(inplace = True)

#dropping empty tweets
blanks = []  # start with an empty list
for i,lb,tweet in df_train.itertuples():  # iterate over the DataFrame
    if type(tweet)==str:            # avoid NaN values
        if tweet.isspace():         # test 'tweet' for whitespace
            blanks.append(i)     # add matching index numbers to the list
df_train.drop(blanks, inplace=True)

#lower case all words to remove noise from Capital words. Capital words may be seen as different from lower case words
df_train = df_train['message'].str.lower()
df_train

tweetid
625221    polyscimajor epa chief doesn't think carbon di...
126103    it's not like we lack evidence of anthropogeni...
698562    rt @rawstory: researchers say we have three ye...
573736    #todayinmaker# wired : 2016 was a pivotal year...
466954    rt @soynoviodetodas: it's 2016, and a racist, ...
                                ...                        
22001     rt @ezlusztig: they took down the material on ...
17856     rt @washingtonpost: how climate change could b...
384248    notiven: rt: nytimesworld :what does trump act...
819732    rt @sara8smiles: hey liberals the climate chan...
806319    rt @chet_cannon: .@kurteichenwald's 'climate c...
Name: message, Length: 15819, dtype: object

In [12]:
def data_cleaning(text):
    #substring
    
    text = re.sub(r'https\S+','',text) #removing urls
    text = re.sub(r'&amp;','and',text)#replace & with and
    text = re.sub(r'rt','retweet',text) #replace 'rt' with retweet
    text = text.translate(str.maketrans('', '', string.punctuation)) #removing punctuations

    return df

In [22]:
# df = df_train.copy()
# df['message'] = df['message'].replace(r'&amp;', 'and', regex=True)
df_train['message']

KeyError: 'message'

In [None]:
# remove duplicate tweets
df_train['message'] = df_train['message'].drop_duplicates(subset=['message'])

### Feature creation

In [None]:
# Create some features from the given tweets,e.g length of tweet. Visualise these created features


### Split data into response and predictors

In [None]:
#### clean_message = the cleaned text data ####
y = df_train['sentiment']
X = df_train['clean_message']


### Split data into training and validation sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Build pipelines to vectorize the data

In [None]:
# Stem, tokenize and remove stopwords (all done within vectorization)
# Build a pipeline that vectorizes the text and creates classifiers for the different models 
# (logistic reg, SVM, Naive Bayes, Random Forest, Neural Nets)


In [None]:
# Write class that has object that tokenizes text data AND stems the tokens
class StemAndTokenize:
    def __init__(self):
        self.ss = SnowballStemmer('english')
    def __call__(self, doc):
        return [self.ss.stem(t) for t in word_tokenize(doc)]


#### Logistic Regression

In [None]:
# Create pipeline for Logistic Regression:
lr = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('lr', LogisticRegression())
              ])


#### Naïve Bayes

In [None]:
# Create pipeline for Naïve Bayes:
nb = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('nb', MultinomialNB())
              ])


#### Linear SVM (Support Vector Machine)

In [None]:
# Create pipeline for SVM:
Lsvm = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('Lsvm', LinearSVC())
              ])


#### Random Forest

In [None]:
# Create pipeline for Random Forest:
rf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('rf', RandomForestClassifier())
              ])


#### KNN (K Nearest Neighbors)

In [None]:
# Create pipeline for KNN:
knn = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('knn', KNeighborsClassifier())
              ])


#### Neural Networks

In [None]:
# Create pipeline for Neural Networks:
nn = Pipeline([('tfidf', TfidfVectorizer(tokenizer=StemAndTokenize())),
               ('nn', MLPClassifier())
              ])


### Hyperparameter optimization

In [None]:
# optimize models by tuning parameters (GridSearch)


### Train models

#### Logistic Regression

In [None]:
# Fitting the Logistic Regression model
lr.fit(X_train, y_train)


#### Naïve Bayes

In [None]:
# Fitting the Naïve Bayes model
nb.fit(X_train, y_train)


#### Linear SVM (Support Vector Machine)

In [None]:
# Fitting the SVM model
Lsvm.fit(X_train, y_train)


#### Random Forest

In [None]:
# Fitting the Random Forest model
rf.fit(X_train, y_train)


#### KNN (K Nearest Neighbors)

In [None]:
# Fitting the KNN model
knn.fit(X_train, y_train)


#### Neural Networks

In [None]:
# Fitting the Neural Networks model
nn.fit(X_train, y_train)


### Evaluate model accuracy

In [None]:
# confusion matrix
# classification report


### Make predictions

In [None]:
# make predicts 


### Save the model

In [None]:
# pickle model


### Log parameters

In [None]:
# save parameters in variables to be logged to comet


### End experiment

### Conclusion