In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TABLE OF CONTENTS 

<a id='table'></a>
### 1. [Importing Libraries](#libraries)  

### 2. [Loading Data](#train_and_test)  
    
### 3. [Cleaning Data](#cleaning)  
     
### 4. [Exploratory Data Analysis](#EDA)

### 5. [Feature Engineering](#extraction)

### 6. [Modelling](#modelling)

### 7. [Model Results](#findings)


## 1. Importing Libraries
<a id='libraries'></a>
   [Back to table of contents](#table)

In [2]:
pip install stopwordsiso

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Libraries used to load dataframe and visualize data
import numpy as np 
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.features import RadViz
from wordcloud import WordCloud
import plotly.io as pio
pio.renderers.default='notebook'
%matplotlib inline

# Noise removal helper libraries
import re
import string 
from stopwordsiso import stopwords as sw
from nltk.corpus import stopwords

# Text Preprocessing
from nltk.tokenize import TweetTokenizer
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Feature Engineering and Data preparation for modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Model building and training
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#Model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

#save the final model and vectorizer
import pickle

# width_size
context = pd.option_context('display.max_colwidth', 400)

ModuleNotFoundError: No module named 'yellowbrick'

## 2. Loading Data
<a id='train_and_test'></a>
   [Back to table of contents](#table)

In [None]:
# Loading train and test dataframes
# Each CSV is read from disk once. `train` / `test` are independent copies of the
# raw data that later sections (hashtag extraction, feature engineering, final
# predictions) start from, while `train_df` / `test_df` feed the cleaning steps.
DATA_DIR = '/kaggle/input/edsa-climate-change-belief-analysis-2021'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
train = train_df.copy()
test = test_df.copy()

In [None]:
# Display the first 10 rows training dataset dataframe, allowing maximum width for the message column
with context:
    display(train_df.head(10))

In [None]:
# Display the first 10 rows testing dataset dataframe, allowing maximum width for the message column
with context:
    display(test_df.head(10))

## 3. Cleaning Data
<a id='cleaning'></a>
   [Back to table of contents](#table)

In [None]:
# Create function to clean data
def clean_data(df):
    """Return a copy of `df` whose `message` column has been stripped of noise.

    The following are removed from every message, in this order: URLs,
    @mentions, digits and non-ASCII characters. The text is then lower-cased
    and every character in `string.punctuation` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `message` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned `message` column; `df` itself is left
        unmodified, matching the behaviour of `tokenize` and `stop_words`.
    """
    # removing noise with regex.
    address = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
    # Applied one after the other, URLs first, so a link is removed as a whole
    # before its digits could be stripped out of it.
    noise_patterns = (address, r'@(\w+)', r'\d+', r'[^\x00-\x7F]+')

    messages = df['message']
    for pattern in noise_patterns:
        # Non-inplace replace: returns a new Series instead of writing through
        # a chained attribute access on the caller's frame.
        messages = messages.replace(to_replace=pattern, value='', regex=True)

    # lower cases to avoid capital letters noise, then remove punctuation
    no_punctuation = str.maketrans('', '', string.punctuation)
    messages = messages.str.lower().str.translate(no_punctuation)

    df = df.copy()
    df['message'] = messages
    return df
    

In [None]:
# Display first 10 rows of clean data of train dataset, allowing max of width
train_df_clean = clean_data(train_df)

with context:
    display(train_df_clean.head(10))

In [None]:
# Display first 10 rows of clean data of test dataset
test_df_clean = clean_data(test_df)

with context:
    display(test_df_clean.head(10))

In [None]:
# Create function that tokenizes the words in a dataframe
def tokenize(df, column):
    """Return a copy of `df` in which `column` holds lists of tokens.

    Every value of `column` is passed through the `tokenize` method of a
    `TweetTokenizer` created with `reduce_len=True`. The input frame is not
    modified.
    """
    tweet_tokenizer = TweetTokenizer(reduce_len = True)
    tokenized = df.copy()
    tokenized[column] = tokenized[column].apply(tweet_tokenizer.tokenize)
    return tokenized

In [None]:
# Creating a tokenized training dataframe
train_df_tokens = tokenize(train_df_clean, 'message')

with context:
    display(train_df_tokens.head(10))

In [None]:
# Creating a tokenized testing dataframe
test_df_tokens = tokenize(test_df_clean, 'message')

with context:
    display(test_df_tokens.head(10))

In [None]:
# Create a function that removes stopwords
def stop_words(df, column_name):
    """Return a copy of `df` with 'rt' and English stopwords removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `column_name` column holds lists of tokens.
    column_name : str
        Name of the tokenized column to filter.

    Returns
    -------
    pandas.DataFrame
        A new frame; each token list keeps only tokens that are neither 'rt'
        nor in the `sw('en')` stopword collection. `df` is not modified.
    """
    # The stopword collection is fetched once here instead of once per token.
    excluded = set(sw('en')) | {'rt'}

    def keep_content_words(tokens):
        # Single pass: equivalent to dropping 'rt' first and stopwords second.
        return [token for token in tokens if token not in excluded]

    filtered = df.copy()
    filtered[column_name] = filtered[column_name].apply(keep_content_words)
    return filtered

In [None]:
# Remove 'rt' and English stopwords from the tokenized training dataframe
train_df_stopwords = stop_words(train_df_tokens, 'message')

# Show the first 10 rows with the wider message column setting
with context:
    display(train_df_stopwords.head(10))

In [None]:
# Remove 'rt' and English stopwords from the tokenized testing dataframe
test_df_stopwords = stop_words(test_df_tokens, 'message')

# Show the first 10 rows with the wider message column setting
with context:
    display(test_df_stopwords.head(10))

In [None]:
# Lemmatize every token of the training dataframe
# One WordNetLemmatizer is shared by all tweets instead of building a new one per tweet.
lemmatizer = WordNetLemmatizer()
train_df_lemmatized = train_df_stopwords.copy()

train_df_lemmatized['message'] = train_df_lemmatized['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence])

# Display the first 10 rows of the lemmatized training dataframe, allowing maximum width for the message column
with context:
    display(train_df_lemmatized.head(10))

In [None]:
# Lemmatize every token of the testing dataframe
# One WordNetLemmatizer is shared by all tweets instead of building a new one per tweet.
lemmatizer = WordNetLemmatizer()
test_df_lemmatized = test_df_stopwords.copy()

test_df_lemmatized['message'] = test_df_lemmatized['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence])

# Display the first 10 rows of the lemmatized testing dataframe, allowing maximum width for the message column
with context:
    display(test_df_lemmatized.head(10))

In [None]:
# Merge the token lists back into space-separated sentences (train_df_lemmatized)
# NOTE: the column is overwritten in place, so run this cell only once per kernel
# session; ' '.join on an already-joined string would put a space between every character.
train_df_lemmatized['message'] = [' '.join(i) for i in train_df_lemmatized['message'].values]

with context:
    display(train_df_lemmatized.head(10))

In [None]:
# Merge the token lists back into space-separated sentences (test_df_lemmatized)
# NOTE: the column is overwritten in place, so run this cell only once per kernel
# session; ' '.join on an already-joined string would put a space between every character.
test_df_lemmatized['message'] = [' '.join(i) for i in test_df_lemmatized['message'].values]

with context:
    display(test_df_lemmatized.head(10))

# 4. Exploratory Data Analysis (EDA)
<a id='EDA'></a>
   [Back to table of contents](#table)

* **Sentiment Dataframe**

In [None]:
# Count the messages in each sentiment class and list the largest class first
sentiment_counts = train_df_lemmatized.groupby('sentiment')['message'].count()
sentiment_df = sentiment_counts.reset_index().sort_values(by = 'message', ascending = False)
sentiment_df

* **Bar Graph**

In [None]:
# Visualize number of tweets using a bar plot
# The bar labels are looked up from the sentiment code of each row of sentiment_df,
# so every bar keeps the right name whatever order the counts end up sorted in.
sentiment_labels = {1: 'Positive', 2: 'News', 0: 'Neutral', -1: 'Negative'}
bar_graph = go.Figure(go.Bar(x = sentiment_df['sentiment'].map(sentiment_labels), y = sentiment_df['message'],
                       marker = {'color': sentiment_df['message'],'colorscale': 'Viridis'}))
bar_graph.update_layout(yaxis_title = 'No. of Tweets', xaxis_title = 'Sentiment', title = 'No. of tweets per sentiment')
bar_graph.show()

* **Word Clouds**

In [None]:
# Collecting words from different sentiments
pos_words = " ".join([i for i in train_df_lemmatized['message'][train_df_lemmatized['sentiment'] == 1]])
neg_words = " ".join([y for y in train_df_lemmatized['message'][train_df_lemmatized['sentiment'] == -1]])
neutral_words = " ".join([i for i in train_df_lemmatized['message'][train_df_lemmatized['sentiment'] == 0]])
news_words = " ".join([y for y in train_df_lemmatized['message'][train_df_lemmatized['sentiment'] == 2]])

In [None]:
# List of frequent words in wordscloud 
freq_words = ['warming', 'change', 'climate', 'global']
new_pos = " ".join([i for i in pos_words.split() if i not in freq_words])
new_neg = " ".join([y for y in neg_words.split() if y not in freq_words])
new_neutral = " ".join([i for i in neutral_words.split() if i not in freq_words])
new_news = " ".join([y for y in news_words.split() if y not in freq_words])

* **Hashtags Extraction**

In [None]:
# Hash_tags function to extract hashtags
def hashtag_function(tweet):
    """Collect the hashtags found in each string of `tweet`.

    `tweet` is an iterable of strings. The result holds one list per string,
    containing the word characters that follow each '#' (the '#' itself is
    not included), in order of appearance.
    """
    hashtag_pattern = re.compile(r"#(\w+)")
    return [hashtag_pattern.findall(text) for text in tweet]

In [None]:
# Extracting hashtags associated to positive, negative, neutral and news class
pos_tags = hashtag_function(train['message'][train['sentiment'] == 1])
neg_tags = hashtag_function(train['message'][train['sentiment'] == -1])
neutral_tags = hashtag_function(train['message'][train['sentiment'] == 0])
news_tags = hashtag_function(train['message'][train['sentiment'] == 2])

In [None]:
# Create a flat list of hashtags for every sentiment
def _flatten(tag_lists):
    """Flatten per-tweet hashtag lists into one list in a single linear pass.

    `sum(tag_lists, [])` copies the growing result on every addition; this
    comprehension visits each tag once. Items that are already plain strings
    are kept as they are, so re-running the cell on an already-flat list
    leaves it unchanged.
    """
    return [tag
            for tags in tag_lists
            for tag in ([tags] if isinstance(tags, str) else tags)]

pos_tags = _flatten(pos_tags)
neg_tags = _flatten(neg_tags)
neutral_tags = _flatten(neutral_tags)
news_tags = _flatten(news_tags)


In [None]:
# Displaying the most frequent words in positive, negative, neutral and news hashtags list
word_pos = nltk.FreqDist(pos_tags)
word_neg = nltk.FreqDist(neg_tags)
word_neutral = nltk.FreqDist(neutral_tags)
word_news = nltk.FreqDist(news_tags)

#Dataframes
word_pos_df = pd.DataFrame({'Hashtags' : list(word_pos.keys()),'Count' : list(word_pos.values())})
word_neg_df = pd.DataFrame({'Hashtags' : list(word_neg.keys()),'Count' : list(word_neg.values())})
word_neutral_df = pd.DataFrame({'Hashtags' : list(word_neutral.keys()),'Count' : list(word_neutral.values())})
word_news_df = pd.DataFrame({'Hashtags' : list(word_news.keys()),'Count' : list(word_news.values())})

# Sorting in descending order
word_pos_df_sorted = word_pos_df.sort_values(by = "Count", ascending = False)
word_neg_df_sorted = word_neg_df.sort_values(by = "Count", ascending = False)
word_neutral_df_sorted = word_neutral_df.sort_values(by = "Count", ascending = False)
word_news_df_sorted = word_news_df.sort_values(by = "Count", ascending = False)

In [None]:
# Display first 10 rows of most frequent words of positive sentiment
word_pos_df_sorted.head(10)

In [None]:
# Display first 10 rows of most frequent words of negative sentiment
word_neg_df_sorted.head(10)

In [None]:
# Display first 10 rows of most frequent words of news sentiment
word_news_df_sorted.head(10)

In [None]:
# Display first 10 rows of most frequent hashtags of neutral sentiment
word_neutral_df_sorted.head(10)

* **Words extraction**

In [None]:
# Lambda function for extracting words
words_extractor = lambda words:  " ".join([i for i in words.split() if i not in freq_words])

word_pos = train_df_lemmatized[train_df_lemmatized['sentiment'] == 1] 
word_pos = word_pos['message'].apply(words_extractor)
word_neg = train_df_lemmatized[train_df_lemmatized['sentiment'] == -1]
word_neg = word_neg['message'].apply(words_extractor)
word_neutral = train_df_lemmatized[train_df_lemmatized['sentiment'] == 0]
word_neutral = word_neutral['message'].apply(words_extractor)
word_news = train_df_lemmatized[train_df_lemmatized['sentiment'] == 2]
word_news = word_news['message'].apply(words_extractor)

* **Call CountVectorizer**

In [None]:
# Use CountVectorizer to transform word_pos, word_neg, word_neutral and word_news
def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of `vectorizer` as a list.

    Uses `get_feature_names_out` when the vectorizer has it and falls back to
    the older `get_feature_names` on scikit-learn releases that do not.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()

# word_positive
countV_pos = CountVectorizer()
docs_positive = countV_pos.fit_transform(word_pos)
features_positive = _feature_names(countV_pos)

# word_negative
countV_neg = CountVectorizer()
docs_negative = countV_neg.fit_transform(word_neg)
features_negative = _feature_names(countV_neg)

# word_neutral
countV_neutral = CountVectorizer()
docs_neutral = countV_neutral.fit_transform(word_neutral)
features_neutral = _feature_names(countV_neutral)

# word_news
countV_news = CountVectorizer()
docs_news = countV_news.fit_transform(word_news)
features_news = _feature_names(countV_news)

* **Frequency Distribution Visualizer**

In [None]:
# Display frequency distribution of top 10 tokens for positive sentiment'
fDist_pos = FreqDistVisualizer(features = features_positive, orient = 'v', n = 10, 
            color = 'm', title = 'frequency_distribution of top 10 tokens for positive sentiment')
visual_pos = RadViz(classes = docs_positive, features = features_positive, size = (800, 420))

fDist_pos.fit(docs_positive)
fDist_pos.show() 

In [None]:
# Display frequency distribution of top 10 tokens for negative sentiment'
fDist_neg = FreqDistVisualizer(features = features_negative, orient = 'v', n = 10, 
            color = 'c', title = 'frequency_distribution of top 10 tokens for negative sentiment')
visual_neg = RadViz(classes = docs_negative, features = features_negative, size = (800, 420))

fDist_neg.fit(docs_negative)
fDist_neg.show() 

In [None]:
# Display frequency distribution of top 10 tokens for neutral sentiment
# The plot title names the neutral class, matching the data fitted below.
fDist_neutral = FreqDistVisualizer(features = features_neutral, orient = 'v', n = 10, 
            color = 'b', title = 'frequency_distribution of top 10 tokens for neutral sentiment')
# NOTE(review): this RadViz object is created but not fitted or shown in this cell.
visual_neutral = RadViz(classes = docs_neutral, features = features_neutral, size = (800, 420))

fDist_neutral.fit(docs_neutral)
fDist_neutral.show()

In [None]:
# Display frequency distribution of top 10 tokens for news sentiment
fDist_news = FreqDistVisualizer(features = features_news, orient = 'v', n = 10, 
            color = 'r', title = 'frequency_distribution of top 10 tokens for news sentiment')
visual_news = RadViz(classes = docs_news, features = features_news, size = (800, 420))

fDist_news.fit(docs_news)
fDist_news.show() 

# 5. Feature Engineering
<a id='extraction'></a>
   [Back to table of contents](#table)

In [None]:
def preprocessing(message):
    """Normalise one raw tweet into plain lower-case text for vectorizing.

    Steps, in order: lower-case the text, delete anything matching
    `http\\S+`, tokenize with `TweetTokenizer(strip_handles=True)`, re-join
    the tokens with single spaces, delete every character that is not a
    lower-case letter or whitespace, and finally delete a leading retweet
    marker 'rt'.

    Parameters
    ----------
    message : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. Whitespace left behind by removed tokens is kept.
    """
    # set to lower cases
    str_msg = message.lower()
    str_msg = re.sub(r"http\S+", "", str_msg)

    # tokenize string
    tokenized_str = TweetTokenizer(strip_handles = True)
    str_msg = tokenized_str.tokenize(str_msg)

    # join and extract string.
    str_msg = " ".join(str_msg)
    # One pass: removing non-alphanumerics and then digits leaves exactly the
    # lower-case letters and whitespace.
    str_msg = re.sub(r'[^a-z\s]', '', str_msg)

    # The word boundary keeps words that merely begin with "rt" intact; only a
    # standalone leading "rt" token is dropped.
    tweet = re.sub(r'^rt\b', '', str_msg)
    return tweet

In [None]:
# Apply the preprocessing function to the messages of a fresh copy of the raw training data
train_df = train.copy()
train_df['message']= train_df['message'].apply(preprocessing)

In [None]:
# Declare variable x and y
x = train_df['message']
y = train_df['sentiment']

In [None]:
# Call CountVectorizer 
count_vector = CountVectorizer(ngram_range =(1,2))
X = count_vector.fit_transform(x)

X.shape

* **Train Test Split**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# 6. Modelling
<a id='modelling'></a>
   [Back to table of contents](#table)

*  **Logistic Regression Classifier**

In [None]:
# Declare logistic Regression Classifier 
Log_reg_class = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', 
                                   random_state = 42).fit(X_train, y_train)


In [None]:
# Display the predictions
pred_log_reg = Log_reg_class.predict(X_test)
pred_log_reg

In [None]:
# Checking model performance with F1 score
acc_model = f1_score(y_test,pred_log_reg,average ="weighted") 
acc_model

* **Support Vector Classifier**

In [None]:
# Declare support vector classifier model
Supp_Vect_Class = SVC(C = 10, gamma = 0.01).fit(X_train, y_train)

In [None]:
# Display the predictions
pred_supp_vect = Supp_Vect_Class.predict(X_test)
pred_supp_vect

In [None]:
# Checking model performance with F1 score
acc_model2 = f1_score(y_test,pred_supp_vect,average ="weighted") 
acc_model2

* **Multinomial Naive Bayes Classifier**

In [None]:
# Declare Multinomial Naive Bayes Classifier, fitted on the training split as-is
# NOTE(review): SMOTE is imported at the top of the notebook but no oversampling is applied here
Naive_Bayes_Class = MultinomialNB().fit(X_train, y_train)

In [None]:
# Display the predictions
pred_naive_bayes = Naive_Bayes_Class.predict(X_test)
pred_naive_bayes

In [None]:
# Checking model performance with F1 score
# Scored against pred_naive_bayes, the predictions of the Naive Bayes model above.
acc_model3 = f1_score(y_test, pred_naive_bayes, average ="weighted") 
acc_model3

* **Conclusion**

The Support Vector Classifier performs better than the other models, so it is used for the final predictions.

# 7. Model Results
<a id='findings'></a>
   [Back to table of contents](#table)

In [None]:
# Model test data 
test_df = test.copy()
test_df['message'] = test_df['message'].apply(preprocessing)
test_count_vector =  count_vector.transform(test_df['message'])

In [None]:
# Predict on test_data
pred_supp_vect_sub = Supp_Vect_Class.predict(test_count_vector)

In [None]:
# Create the submission dataframe
submission = pd.DataFrame({'tweetid' : test_df['tweetid'], 
                           'sentiment' : pred_supp_vect_sub})
submission

In [None]:
# Save the submission dataframe to a CSV file, without the index column
submission.to_csv("Bote_Mkwanazi_Classification_Prediction.csv", index = False)