# **1. Problem Statement and Importing Data**

In [96]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import urllib
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support

nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Import CSV files into kaggle notebook
# sample_submission_data = pd.read_csv('../input/climate-change-edsa202021-data/sample_submission.csv')
# test_data = pd.read_csv('../input/climate-change-edsa202021-data/test.csv')
# train_data = pd.read_csv('../input/climate-change-edsa202021-data/train.csv')

In [2]:
# Import CSV files into colabs
from google.colab import files
uploaded = files.upload()



Saving sample_submission.csv to sample_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [3]:
sample_submission_data = pd.read_csv('sample_submission.csv')
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

# **2. Tweets Preprocessing and Cleaning**

2.1 Make up of training data

In [4]:
#Sample of the training data
train_data.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [5]:
#Sample of tweet message in training data

train_data['message'][10000]

"The Washington Post LIES Non-Stop, like THIS: 'As Trump halts Fed action on climate change, cities &amp; states push on' https://t.co/4vOLbKiiLz"

In [6]:
#Type of data for each column
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


2.2 Preprocessing of training data

In [7]:
clean_data = train_data.copy()

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
print(stop_words)

{'further', "she's", 'am', 'because', 'i', 'against', 'them', 'by', "isn't", 'an', 'why', 'yourself', 'have', 'too', 'very', 'as', "couldn't", "didn't", 'here', 'at', "mustn't", "mightn't", 'after', 'we', 'can', 'd', 'for', 'its', 'but', 'own', 'now', 'ain', 'shan', 'him', 'other', 'most', "needn't", 'your', 'off', 'which', 'don', 'were', 'itself', 'those', 'he', 'while', 'me', 'haven', 'has', 'there', 'didn', "shan't", 'll', 'same', "haven't", 'she', 'was', 'if', "you'll", 'our', 'before', 'or', 'himself', 'this', 'up', "shouldn't", 'you', "aren't", 'doing', 't', 'won', 'hadn', 'herself', 'couldn', 'their', 'ours', 'both', "that'll", 'should', 'hers', 'again', "hadn't", 'had', 'mightn', "it's", 'they', 'mustn', 'with', 'aren', "wasn't", 'few', 'these', 've', 'over', "weren't", 'of', 'who', 'a', 'some', 'be', 'his', 'needn', 'just', 'myself', 'and', 'o', 'hasn', 'not', 'did', 'than', 'above', 'having', 'where', 'been', 'the', 'yours', 'into', 'does', 'isn', 'once', 'wouldn', "you've", 

In [None]:
def clean_text(text):
    '''Remove various features from tweet strings'''

    text = str(text).lower() # Make lower case
    text = re.sub('\[.*?\]', '', text) # Remove square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub('<.*?>+', '', text)
    text = re.sub('@[^\s]+', 'AT_USER', text) # Replace user names with 'AT_USER'
    text = "".join([char.lower() for char in text if char not in string.punctuation]) # Reomove punctuation  
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'#([^\s]+)', r'\1', text) # Remove the # in #hashtag

    return text

In [None]:
# Create new column with cleaned tweets
# clean_data['clean'] = train_data['message'].apply(lambda x:clean_text(x))


In [10]:
# Skipped cleaning the tweet data because it made the model worse
clean_data['clean'] = train_data['message']

In [11]:
clean_data.head()

Unnamed: 0,sentiment,message,tweetid,clean
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,PolySciMajor EPA chief doesn't think carbon di...
1,1,It's not like we lack evidence of anthropogeni...,126103,It's not like we lack evidence of anthropogeni...
2,2,RT @RawStory: Researchers say we have three ye...,698562,RT @RawStory: Researchers say we have three ye...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,#TodayinMaker# WIRED : 2016 was a pivotal year...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"RT @SoyNovioDeTodas: It's 2016, and a racist, ..."


In [12]:
clean_data['message'].head()

0    PolySciMajor EPA chief doesn't think carbon di...
1    It's not like we lack evidence of anthropogeni...
2    RT @RawStory: Researchers say we have three ye...
3    #TodayinMaker# WIRED : 2016 was a pivotal year...
4    RT @SoyNovioDeTodas: It's 2016, and a racist, ...
Name: message, dtype: object

In [13]:
#Function to tokenize tweet data
def tokenize_column_data(df, column_name):
  tweet_tokenizer = TweetTokenizer()

  tweet_tokens = []
  for index, value in clean_data[column_name].items():
      
      tweet_tokens.append(tweet_tokenizer.tokenize(value))

  
  df['tokenized'] = np.array(tweet_tokens)
  df['tokenized'].apply(lambda x: [item for item in x if item not in stopwords.words('english')])
  
  return df


In [14]:
# Create new column with tokenized tweet
clean_data = tokenize_column_data(train_data, 'clean')

  # This is added back by InteractiveShellApp.init_path()


In [15]:
clean_data.head()

Unnamed: 0,sentiment,message,tweetid,tokenized
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ..."


In [None]:
#Remove stopwords from tweets
remove_stopwords = []
for index, value in clean_data['tokenized'].items():
  tokens_without_sw = [word for word in value if not word in stopwords.words()]
      
  remove_stopwords.append(tokens_without_sw)

In [None]:
#Add column to clean_data with removed stopwords
#clean_data['nostopwords'] = np.array(remove_stopwords)


In [16]:
# Did not include the remove stopwords step because it made the model worse
clean_data['nostopwords'] = clean_data['tokenized']

In [17]:
clean_data.head()

Unnamed: 0,sentiment,message,tweetid,tokenized,nostopwords
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ..."


In [18]:
#Lemmatize tweet data
lemma_list = []
for index, value in clean_data['nostopwords'].items():
  lemmatizer = WordNetLemmatizer()

  
  lemma_tokens = [lemmatizer.lemmatize(w) for w in value]
      
  lemma_list.append(lemma_tokens)


In [None]:
#Add column to clean_data with lemmatized words
#clean_data['lemma'] = np.array(lemma_list)

In [19]:
# Skipped the lemmatization step becuase it made the model worse
clean_data['lemma'] = clean_data['nostopwords']

In [20]:
clean_data.head()

Unnamed: 0,sentiment,message,tweetid,tokenized,nostopwords,lemma
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ..."


In [21]:
#Concatinate tokenised sentence before vectorization
concat_list = []
for index, value in clean_data['nostopwords'].items():

  concat = " ".join(value)

      
  concat_list.append(concat)


In [22]:
#Add column to clean_data with concatinated sentence
clean_data['concatenate'] = np.array(concat_list)

In [23]:
clean_data.head()

Unnamed: 0,sentiment,message,tweetid,tokenized,nostopwords,lemma,concatenate
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...",PolySciMajor EPA chief doesn't think carbon di...
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...",It's not like we lack evidence of anthropogeni...
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,...",RT @RawStory : Researchers say we have three y...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...",#TodayinMaker # WIRED : 2016 was a pivotal yea...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","RT @SoyNovioDeTodas : It's 2016 , and a racist..."


In [56]:
# The tweets did not need to be pre-processed because it made the model worse so fo the y data the original tweet 'message' was used
y = clean_data['sentiment']
X = clean_data['message']

In [67]:
# Vectorize the tweet messages
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

Class Description 

2 News: the tweet links to factual news about climate change 

1 Pro: the tweet supports the belief of man-made climate change 

0 Neutral: the tweet neither supports nor refutes the belief of man-made climate change 

-1 Anti: the tweet does not believe in man-made climate change

In [68]:
# Imbalnced data shown below
clean_data.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [188]:
# Use SMOTE to address the class imbalances in the data
smote = SMOTE(sampling_strategy="not majority")
X_sm, y_sm = smote.fit_sample(X_vectorized, y)



# **3. Training the model and evaluating using the validation set**

In [190]:
#Split data into train and test
X_train,X_val,y_train,y_val = train_test_split(X_sm,y_sm,test_size=.3, random_state=11) #, shuffle=True

In [191]:
# Use a RandomForestClassifier as a model
rfc = RandomForestClassifier(class_weight="balanced_subsample")
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_val)

In [192]:
f1_score(y_val, rfc_pred, average="macro")

0.8927836709909829

In [193]:
# Try train a LogisticsRegression model
lg = LogisticRegression()
lg.fit(X_train, y_train)
lg_pred = rfc.predict(X_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [194]:
f1_score(y_val, lg_pred, average="macro")

0.8927836709909829

# **4. Cross Validation**

In [195]:
print(classification_report(y_val, rfc_pred))

              precision    recall  f1-score   support

          -1       0.99      0.94      0.97      2577
           0       0.91      0.88      0.89      2549
           1       0.79      0.85      0.82      2596
           2       0.89      0.89      0.89      2514

    accuracy                           0.89     10236
   macro avg       0.90      0.89      0.89     10236
weighted avg       0.89      0.89      0.89     10236



# **4. Hyperparameter tuning**

In [125]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [  {"classifier": [MultinomialNB()],
                  "classifier__alpha": [1, 1e-1, 1e-2]
                 },
              
                {"classifier": [AdaBoostClassifier()],
                 'classifier__n_estimators': [50, 100],
                 'classifier__learning_rate' : [0.01,0.05,0.1,0.3,1],
                #  'classifier__loss' : ['linear', 'square', 'exponential']
                },
              
                {"classifier": [LinearSVC()],
                "classifier__C":[1, 10, 100],
                # "classifier__gamma":[0.1, 0.01]
                },               
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] # The solver does not allow for 'l1' penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}
              ]

gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1, scoring='f1_macro') # Fit grid search
best_model = gridsearch.fit(X_train,y_train) #Find the best model

In [196]:
#Check for the best model after hyperparameter tuning
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_val,y_val))

Pipeline(memory=None,
         steps=[('classifier',
                 LinearSVC(C=10, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)
The mean accuracy of the model is: 0.9835109438700915


In [197]:
# Use the best model on the training data
lm_best = LinearSVC(C=10, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0)
lm_best.fit(X_train, y_train)
lm_best_pred = rfc.predict(X_val)

In [198]:
#Check the f1 score
f1_score(y_val, lm_best_pred, average="macro")

0.8927836709909829

In [199]:
print(classification_report(y_val, lm_best_pred))

              precision    recall  f1-score   support

          -1       0.99      0.94      0.97      2577
           0       0.91      0.88      0.89      2549
           1       0.79      0.85      0.82      2596
           2       0.89      0.89      0.89      2514

    accuracy                           0.89     10236
   macro avg       0.90      0.89      0.89     10236
weighted avg       0.89      0.89      0.89     10236



# **5. Making predictions on the test set and adding a sentiment column to our original test df**

In [200]:
testx = test_data['message']
test_vect = vectorizer.transform(testx)

In [201]:
y_pred = lm_best.predict(test_vect)


In [202]:
test_data['sentiment'] = y_pred


In [203]:
test_data.head()


Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,2
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [204]:
test_data[['tweetid','sentiment']].to_csv('climate_change_edsa2020-21_submission.csv', index=False)


In [205]:
files.download('climate_change_edsa2020-21_submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>