In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TABLE OF CONTENTS 

<a id='table'></a>
### 1. [Importing Libraries](#libraries)  

### 2. [Loading Data](#train_and_test)  
    
### 3. [Cleaning Data](#cleaning)  
     
### 4. [Exploratory Data Analysis](#EDA)

### 5. [Feature Engineering](#extraction)

### 6. [Model Training](#training)

### 7. [Model Results](#findings)


## 1. Importing Libraries
<a id='libraries'></a>
   [Back to table of contents](#table)

In [None]:
# Libraries used to load dataframe and visualize data
import numpy as np 
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.features import RadViz
from wordcloud import WordCloud
import plotly.io as pio
pio.renderers.default='notebook'
%matplotlib inline

# Noise removal helper libraries
import re
import string 
from stopwordsiso import stopwords as sw
from nltk.corpus import stopwords

# Text Preprocessing
from nltk.tokenize import TweetTokenizer
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Feature Engineering and Data preparation for modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Model building and training
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#Model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

#save the final model and vectorizer
import pickle

# Display setting used by the preview cells below: inside `with context:` pandas'
# 'display.max_colwidth' option is set to 400 so long message text is shown in full.
# NOTE(review): this one context-manager object is entered again in several later
# cells; confirm the installed pandas version allows re-entering an option_context
# instance (otherwise create it inline: `with pd.option_context(...):`).
context = pd.option_context('display.max_colwidth', 400)

In [None]:
pip install stopwordsiso


## 2. Loading Data
<a id='train_and_test'></a>
   [Back to table of contents](#table)

In [None]:
# Read the competition's train and test CSV files into DataFrames
train_csv_path = '/kaggle/input/edsa-climate-change-belief-analysis-2021/train.csv'
test_csv_path = '/kaggle/input/edsa-climate-change-belief-analysis-2021/test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Preview the first 10 rows of the training DataFrame with the wide-column display setting
with context:
    display(train_df.head(n=10))

In [None]:
# Preview the first 10 rows of the testing DataFrame with the wide-column display setting
with context:
    display(test_df.head(n=10))

## 3. Cleaning Data
<a id='cleaning'></a>
   [Back to table of contents](#table)

In [None]:
# Create function to clean data
def clean_data(df, column='message'):
    """Strip noise from the tweet text in ``df[column]`` and return ``df``.

    The cleaning steps run in this order:

    1. remove http/https URLs,
    2. remove @mentions,
    3. remove runs of digits,
    4. remove non-ASCII characters,
    5. lower-case the text,
    6. remove every character listed in ``string.punctuation``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The cleaned text is written back into
        ``df[column]``, so the caller's frame is modified.
    column : str, default 'message'
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``df[column]`` cleaned.
    """
    address = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
    # Order matters: URLs go first so their digits and '@' are removed as part of the URL.
    noise_patterns = (address, r'@(\w+)', r'\d+', r'[^\x00-\x7F]+')

    text = df[column]
    for pattern in noise_patterns:
        text = text.str.replace(pattern, '', regex=True)

    # Translation table that deletes every punctuation character in one pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    text = text.str.lower().str.translate(drop_punctuation)

    # Write the result back with a single explicit assignment. Calling
    # replace(..., inplace=True) on the selected column (df.message) is a chained
    # in-place edit that does not reach the parent frame under pandas copy-on-write.
    df[column] = text
    return df
    

In [None]:
# Clean the training tweets, then preview the first 10 cleaned rows at full column width.
# Note: clean_data writes into and returns the frame it is given, so train_df_clean
# is the same object as train_df.
train_df_clean = clean_data(train_df)
with context:
    display(train_df_clean.head(n=10))

In [None]:
# Clean the testing tweets, then preview the first 10 cleaned rows at full column width.
# Note: clean_data writes into and returns the frame it is given, so test_df_clean
# is the same object as test_df.
test_df_clean = clean_data(test_df)
with context:
    display(test_df_clean.head(n=10))

In [None]:
# Create function that tokenizes the words in a dataframe
def tokenize(df, column):
    """Return a copy of ``df`` in which ``df[column]`` holds token lists.

    Each value of the column is passed through
    ``TweetTokenizer(reduce_len=True).tokenize``; the input frame is left
    untouched because the work is done on a copy.
    """
    tokenized = df.copy()
    tweet_tokenizer = TweetTokenizer(reduce_len=True)
    tokenized[column] = tokenized[column].apply(tweet_tokenizer.tokenize)
    return tokenized

In [None]:
# Tokenize the cleaned training messages and preview the first 10 rows
train_df_tokens = tokenize(train_df_clean, column='message')
with context:
    display(train_df_tokens.head(n=10))

In [None]:
# Tokenize the cleaned testing messages and preview the first 10 rows
test_df_tokens = tokenize(test_df_clean, column='message')
with context:
    display(test_df_tokens.head(n=10))

In [None]:
# Create a function that removes stopwords
def stop_words(df, column_name, stopword_set=None):
    """Return a copy of ``df`` whose token lists have stop words removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` column holds lists of tokens. It is not
        modified; the filtering is applied to a copy.
    column_name : str
        Name of the tokenized column.
    stopword_set : iterable of str, optional
        Words to drop. Defaults to the English list from stopwordsiso
        (``sw('en')``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every token list has had the token ``'rt'``
        and all stop words removed, with token order preserved.
    """
    if stopword_set is None:
        # Build the stop-word collection once instead of calling sw('en') per token.
        stopword_set = sw('en')

    # 'rt' is always excluded in addition to the stop words
    # (presumably the lower-cased retweet marker; verify against the data).
    excluded = set(stopword_set) | {'rt'}

    filtered = df.copy()
    filtered[column_name] = filtered[column_name].apply(
        lambda tokens: [token for token in tokens if token not in excluded]
    )
    return filtered

In [None]:
# Apply stop_words to the tokenized training dataframe and preview the first 10 rows
train_df_stopwords = stop_words(train_df_tokens, column_name='message')
with context:
    display(train_df_stopwords.head(n=10))

In [None]:
# Apply stop_words to the tokenized testing dataframe and preview the first 10 rows
test_df_stopwords = stop_words(test_df_tokens, column_name='message')
with context:
    display(test_df_stopwords.head(n=10))

In [None]:
# Lemmatize every token of the stop-word-filtered training dataframe
train_df_lemmatized = train_df_stopwords.copy()

# Build the lemmatizer once and reuse it, rather than constructing a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()
train_df_lemmatized['message'] = train_df_lemmatized['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence]
)

# Display the first 10 rows of the lemmatized training dataframe, allowing maximum width for the message column
with context:
    display(train_df_lemmatized.head(10))

In [None]:
# Lemmatize every token of the stop-word-filtered testing dataframe
test_df_lemmatized = test_df_stopwords.copy()

# Build the lemmatizer once and reuse it, rather than constructing a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()
test_df_lemmatized['message'] = test_df_lemmatized['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence]
)

# Display the first 10 rows of the lemmatized testing dataframe, allowing maximum width for the message column
with context:
    display(test_df_lemmatized.head(10))