## Import the necessary libraries

In [34]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob
from wordcloud import WordCloud

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Show every file available under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)
        



## Load your data from Kaggle
By working in a Kaggle kernel, you can access the data directly from the competition and make your submission without downloading your output file.

In [35]:
# Read the competition's training set, test set and sample submission.
train = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/train.csv')
test = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/test.csv')
sample = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/sample_submission.csv')
# Stack train on top of test (row-wise) for whole-corpus exploration;
# sort=True orders the combined columns.
# NOTE(review): presumably test has no 'sentiment' column, so those rows
# would hold NaN there - confirm against the data.
df = pd.concat([train, test], axis=0, sort=True)

In [36]:
train.head()

In [37]:
test.head()

In [38]:
sample.head()

In [39]:
train.sentiment.value_counts()

In [40]:
# Class balance: number of training tweets per sentiment label.
plot = train['sentiment'].value_counts()
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally; .values avoids any index alignment between the
# label index and the count Series.
sns.barplot(x=plot.index, y=plot.values)

In [41]:
df.head()

In [42]:
# Draw the boolean missing-value mask of the training set as a heatmap
# (row tick labels and colour bar hidden) to spot columns with gaps.
sns.heatmap(train.isnull(), yticklabels = False, cbar = False, cmap = "Blues")

In [43]:
df.describe()

In [44]:
# positive = train[train['label']==0]
# negative = train[train['label']==1]

# blob = TextBlob("This is a train")
# blob.tags


## Cleaning Data

In [45]:
# #Creating new dataframe and new features
# df = pd.DataFrame()
# tw_list["text"] = tw_list[0]
# #Removing RT, Punctuation etc
# remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
# rt = lambda x: re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
# tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
# tw_list["text"] = tw_list.text.str.lower()
# tw_list.head(10)

In [46]:
def gen_freq(message):
    """Count how often each whitespace-separated word occurs.

    Parameters
    ----------
    message : pandas ``.str`` accessor or str
        Anything whose ``split()`` yields either one list of words per
        tweet (``train.message.str``) or the words themselves (a plain
        string).

    Returns
    -------
    pandas.Series
        Word -> number of occurrences, most frequent first.
    """
    word_list = []

    for tw_words in message.split():
        if isinstance(tw_words, str):
            # Plain-string input: split() already produced single words;
            # extending with a str would add its characters instead.
            word_list.append(tw_words)
        elif isinstance(tw_words, (list, tuple)):
            word_list.extend(tw_words)
        # Anything else is a missing tweet (NaN/None from .str.split());
        # it carries no words, so it is skipped rather than crashing.

    return pd.Series(word_list).value_counts()

gen_freq(train.message.str)

In [47]:
# Generate a word cloud (at most 100 words, white background) from the
# word frequencies of the raw training tweets.
wc = WordCloud(width=400, height=330, max_words=100, background_color='white').generate_from_frequencies(gen_freq(train.message.str))

# Render the cloud as an image without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [48]:
import re

def clean_text(text):
    """Strip retweet markers, escaped ampersands and punctuation from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. Letter case is left untouched.
    """
    # Each step works on the result of the previous one, so that all three
    # clean-ups are reflected in the returned string.

    # Remove the standalone retweet marker; the word boundaries keep words
    # that merely contain "RT" (e.g. "SMART") intact.
    message = re.sub(r'\bRT\b', '', text)

    # Turn the HTML-escaped ampersand back into a plain '&'.
    message = re.sub(r'&amp;', '&', message)

    # Drop punctuation, including the '#' and '@' prefixes.
    message = re.sub(r'[?!.;:,#@-]', '', message)

    return message


In [49]:
from wordcloud import STOPWORDS

# Clean every training tweet first.
text = train.message.apply(clean_text)
# Count words over the cleaned tweets, so the cleaning step actually feeds
# the cloud instead of being computed and thrown away.
word_freq = gen_freq(text.str)*100
# Drop wordcloud's built-in stop words; labels that never occur are ignored.
word_freq = word_freq.drop(labels=list(STOPWORDS), errors='ignore')

# Generate a word cloud (at most 200 words) from the filtered frequencies.
wc = WordCloud(width=450, height=330, max_words=200, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 14))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [50]:
def preprocess_text(text):
    """Turn a piece of text into a list of lemmatised keywords.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Lower-cased tokens, lemmatised as verbs, with English stop words
        removed; original token order is kept.
    """
    # Tokenise on runs of word characters, which drops punctuation.
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise (treating every token as a verb).
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Load the stop-word list once, as a set: calling stopwords.words()
    # for every lemma re-reads the list each time and scans it linearly.
    stop_words = set(stopwords.words('english'))
    return [lemma for lemma in lemmas if lemma not in stop_words]

## Splitting out the X variable from the target

In [51]:
# Target: the sentiment label of each training tweet.
y = train['sentiment']
# Feature: the raw tweet text (vectorised in a later step).
X = train['message']

## Turning text into something your model can read

In [52]:
# TF-IDF over unigrams and bigrams, using scikit-learn's built-in English
# stop-word list and a minimum document frequency of 2.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
# Learn the vocabulary from the training tweets and build the feature matrix.
X_vectorized = vectorizer.fit_transform(X)

In [53]:
# # Create an instance of TfidfVectorizer
# vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# # Fit to the data and transform to feature matrix
# X_train = vectoriser.fit_transform(X_train['speech'])
# # Convert sparse matrix to dataframe
# X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
# # Save mapping on which index refers to which words
# col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
# # Rename each column using the mapping
# for col in X_train.columns:
#     X_train.rename(columns={col: col_map[col]}, inplace=True)
# X_train

In [54]:
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [55]:
# Hold out 20% of the vectorised tweets for validation; the split is
# shuffled, stratified on the sentiment label and seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=11,
    shuffle=True,
    stratify=y,
)

## Training the model and evaluating using the validation set 

In [56]:
# Seed the forest (same seed as the train/validation split) so the
# validation score and the submission are reproducible between runs.
rfc = RandomForestClassifier(random_state=11)
rfc.fit(X_train, y_train)
# Predict labels for the held-out validation rows.
rfc_pred = rfc.predict(X_val)

## Checking the performance of our model on the validation set

In [57]:
# Macro-averaged F1 of the validation predictions against the true labels.
f1_score(y_val, rfc_pred, average="macro")

## Getting our test set ready 

In [58]:
# Raw tweet text of the test set.
testx = test['message']
# Reuse the already-fitted vectoriser (transform only, no re-fit) so the
# test features share the training vocabulary.
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [59]:
# Predict a sentiment label for every vectorised test tweet.
y_pred = rfc.predict(test_vect)

In [60]:
# Store the predictions next to the tweets they belong to.
test['sentiment'] = y_pred

In [61]:
test.head()

## Creating an output csv for submission

In [62]:
# Write only the tweet id and predicted sentiment, without the DataFrame
# index, to the submission file.
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)