## Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from kaggle.  
By working in a Kaggle kernel, you can access the data directly from the competition and make your submission without downloading your output file.

In [None]:
# Load the train and test CSVs from the Kaggle input directory into DataFrames.
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [None]:
# Preview the first few rows of the training frame.
train.head()

In [None]:
# Bar chart of the number of rows per sentiment label, to inspect class balance.
train.sentiment.value_counts().plot(kind = 'bar')
plt.show()

In [None]:
# Replace every http/https URL in the training messages with the placeholder
# token 'url-web', so individual links collapse into a single token.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [None]:
# Split each training message into a list of tokens with NLTK's Treebank
# tokenizer and store the lists in a new 'tokens' column.
tokeniser = TreebankWordTokenizer()
train['tokens'] = train['message'].apply(tokeniser.tokenize)

In [None]:
# Only the Snowball stemmer (English) is instantiated; PorterStemmer and
# LancasterStemmer are imported but not used in the visible code.
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
stemmer = SnowballStemmer('english')

In [None]:
def train_stemmer(words, stemmer):
    """Return a list with the stem of every token in *words*.

    Args:
        words: Iterable of tokens to stem.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        A new list of stemmed tokens, in the same order as *words*.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
# Stem every token list and keep the result in a 'stem' column.
# NOTE(review): the vectorizer further down reads 'message', not this column.
train['stem'] = train['tokens'].apply(train_stemmer, args=(stemmer, ))

In [None]:
# Single WordNet lemmatizer instance reused for both the train and test frames.
lemmatizer = WordNetLemmatizer()
def train_lemma(words, lemmatizer):
    """Return a list with the lemma of every token in *words*.

    Args:
        words: Iterable of tokens to lemmatize.
        lemmatizer: Object exposing a ``lemmatize(word)`` method.

    Returns:
        A new list of lemmatized tokens, in the same order as *words*.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize every token list and keep the result in a 'lemma' column.
# NOTE(review): the vectorizer further down reads 'message', not this column.
train['lemma'] = train['tokens'].apply(train_lemma, args=(lemmatizer, ))

In [None]:
# Number of training rows per sentiment label.
train.sentiment.value_counts()

## Splitting out the X variable from the target

In [None]:
# Target is the sentiment label; the feature is the message text
# (with URLs already replaced), not the 'tokens', 'stem' or 'lemma' columns.
y = train['sentiment']
X = train['message']

## Turning text into something your model can read

In [None]:
# TF-IDF features over unigrams and bigrams, ignoring English stop words and
# any term that appears in fewer than 2 documents.
# NOTE(review): the vocabulary is fitted on all training rows, before the
# train/validation split further down.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

In [None]:
# SMOTE oversampler with default settings.
# NOTE(review): no random_state is passed, so resampling is presumably not
# reproducible from run to run — confirm whether that matters here.
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [None]:

# Oversample the vectorized training data with SMOTE to balance the classes.
# `fit_resample` is the supported imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed.
# NOTE(review): resampling happens before the train/validation split below, so
# synthetic points interpolated from training rows can land in the validation
# set and make the validation score optimistic.
X_train_smote, y_train_smote = smote.fit_resample(X_vectorized, y)


## Splitting the training data into a training and validation set

In [None]:
# Hold out 30% of the resampled data for validation; the split is shuffled,
# stratified on the resampled labels, and seeded for reproducibility.
X_train,X_val,y_train,y_val = train_test_split(X_train_smote,y_train_smote,test_size=.3,shuffle=True, stratify=y_train_smote, random_state=11)

## Training the model and evaluating using the validation set 

In [None]:
# Hyper-parameter grid searched for the SVC: two kernels, two values of C and
# two values of gamma.
parameters = dict(
    kernel=('linear', 'rbf'),
    C=(0.25, 1.0),
    gamma=(1, 2),
)

In [None]:
# Exhaustive grid search over `parameters` for a support-vector classifier,
# fitted on the training split with GridSearchCV's default cv and scoring.
# NOTE(review): the default scoring for a classifier is presumably accuracy,
# while the validation cell below reports macro F1 — confirm this is intended.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svm = SVC()
clf = GridSearchCV(svm, parameters)
clf.fit(X_train,y_train)

In [None]:
# Predict labels for the validation split with the fitted grid search.
y_pred = clf.predict(X_val)

## Checking the performance of our model on the validation set

In [None]:
# Macro-averaged F1 on the validation split (each class weighted equally).
f1_score(y_val, y_pred, average="macro")

## Getting our test set ready 

In [None]:
# Apply the same URL replacement to the test messages as was applied to the
# training messages (same pattern, same 'url-web' placeholder).
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
# Tokenise the test messages with a Treebank tokenizer, as done for train.
tokeniser = TreebankWordTokenizer()
test['tokens'] = test['message'].apply(tokeniser.tokenize)

In [None]:
# Stem the test tokens with the same stemmer used for the training frame.
test['stem'] = test['tokens'].apply(train_stemmer, args=(stemmer, ))

In [None]:
# Lemmatize the test tokens with the same lemmatizer used for the training frame.
test['lemma'] = test['tokens'].apply(train_lemma, args=(lemmatizer, ))

In [None]:
# Vectorize the test messages with the already-fitted TF-IDF vectorizer
# (transform only, so the vocabulary stays the one fitted on the training data).
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
# Predict test labels with the fitted grid search `clf`, which delegates to the
# best estimator it found. No `svc` object is defined anywhere in the visible
# notebook, so calling `svc.predict` would raise a NameError.
y_pred = clf.predict(test_vect)

In [None]:
# Attach the predicted labels to the test frame as a 'sentiment' column.
test['sentiment'] = y_pred

In [None]:
# Preview the first few rows of the test frame, now including predictions.
test.head()

## Creating an output csv for submission

In [None]:
# Write only the tweet id and predicted sentiment to the submission CSV,
# without the DataFrame index.
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)