## Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from sklearn.utils import resample

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from Kaggle
By working in a Kaggle kernel, you can read the data directly from the competition and make your submission without downloading your output file.

In [None]:
# Read the competition's labelled training set and unlabelled test set.
# NOTE(review): these relative paths presumably only resolve inside a Kaggle
# kernel with the competition data attached — confirm before running elsewhere.
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Count rows per sentiment label to see how balanced the classes are.
train['sentiment'].value_counts()

# Cleaning the Data

In [None]:
# Materialise NLTK's English stop-word list as a set so the membership test
# in preprocess_message_text is a hash lookup rather than a list scan.
# NOTE(review): presumably the NLTK 'stopwords' corpus is already installed
# in this environment — otherwise this line raises a LookupError.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_message_text(tweet):
    """Normalise a raw tweet into a space-joined string of content words.

    The text is lower-cased, stripped of URLs, @mentions, ``#`` symbols and
    ASCII punctuation, tokenised with NLTK's ``word_tokenize``, and filtered
    against the module-level ``stop_words`` set.

    Args:
        tweet: Raw tweet text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # str.lower() returns a new string, so the result has to be rebound;
    # calling it for its side effect leaves the casing untouched and lets
    # capitalised stop words ("The", "And", ...) slip through the filter.
    tweet = tweet.lower()
    # Drop anything that looks like a link.
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)
    # Remove whole @mentions, but only the '#' symbol of a hashtag so the
    # tag's word is kept as a feature.
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = tweet.translate(_PUNCTUATION_TABLE)
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]

    return " ".join(filtered_words)

    

In [None]:
# Clean every training message; the raw text in the 'message' column is
# overwritten with the preprocessed version.
train['message'] = train['message'].apply(preprocess_message_text)

In [None]:
def _resample_to(frame, target_size):
    """Draw *target_size* rows from *frame* with replacement (fixed seed)."""
    return resample(frame, replace=True, n_samples=target_size, random_state=27)


# One sub-frame per sentiment label: 1, -1, 0 and 2 respectively.
positive, negative, neutral, news = (
    train[train['sentiment'] == label] for label in (1, -1, 0, 2)
)

# Every other class is resampled to the size of the positive class
# (presumably the largest one — confirm with the value counts above).
# NOTE(review): this happens before the train/validation split further down,
# so copies of the same row can land on both sides of that split and the
# validation score is likely optimistic.
reference_size = len(positive)
negative_upsampled, neutral_upsampled, news_upsampled = (
    _resample_to(frame, reference_size) for frame in (negative, neutral, news)
)

upsampled = pd.concat(
    [positive, negative_upsampled, neutral_upsampled, news_upsampled]
)


In [None]:
# Confirm that every sentiment label now has the same number of rows.
upsampled['sentiment'].value_counts()

## Splitting out the X variable from the target

In [None]:
# Features are the cleaned message text; the target is the sentiment label.
X, y = upsampled['message'], upsampled['sentiment']

## Turning text into something your model can read

In [None]:
# TF-IDF over unigrams and bigrams, ignoring English stop words and any term
# that appears in fewer than two documents.
# NOTE(review): the vectorizer is fitted on all of X before the validation
# split, so validation rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [None]:
# Hold out 30% of the rows for validation, keeping the label proportions
# identical in both parts; the seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=0.3,
    random_state=11,
    shuffle=True,
    stratify=y,
)

## Training the model and evaluating using the validation set 

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split (fit returns
# the estimator itself), then label the validation split.
NB = MultinomialNB().fit(X_train, y_train)
NB_pred = NB.predict(X_val)

## Checking the performance of our model on the validation set

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_score(y_true=y_val, y_pred=NB_pred, average="macro")

## Getting our test set ready 

In [None]:
# Run the test messages through the same cleaning function that was applied
# to the training messages. Without this, URLs, @mentions and punctuation
# survive in the test text only, so the test features are built from
# different text than the model was trained on.
# The raw test['message'] column itself is left untouched.
testx = test['message'].apply(preprocess_message_text)
# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# matrix shares the training vocabulary and IDF weights.
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
# Predict a sentiment label for every row of the vectorised test set.
y_pred = NB.predict(test_vect)

In [None]:
# Store the predicted labels on the test frame as a 'sentiment' column so
# each prediction stays alongside its row.
test['sentiment'] = y_pred

In [None]:
# Preview the test frame, which now carries the predicted 'sentiment' column.
test.head()

In [None]:
# Distribution of predicted labels across the test set.
test['sentiment'].value_counts()

## Creating an output csv for submission

In [None]:
# Keep only the two columns written to the submission file, and leave the
# DataFrame index out of the CSV.
submission = test[['tweetid', 'sentiment']]
submission.to_csv('testsubmission.csv', index=False)