## Import the necessary libraries

In [3]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from imblearn.under_sampling import (RandomUnderSampler, 
                                    NearMiss, 
                                    InstanceHardnessThreshold,
                                    CondensedNearestNeighbour,
                                    EditedNearestNeighbours,
                                    RepeatedEditedNearestNeighbours,
                                    AllKNN,
                                    NeighbourhoodCleaningRule,
                                    OneSidedSelection,
                                    TomekLinks)
from bs4 import BeautifulSoup             
import re
from nltk.stem import PorterStemmer
porter_stemmer=PorterStemmer()

## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [4]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [5]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [6]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [7]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

## Splitting out the X variable from the target

In [8]:
y = train['sentiment']
X = train['message']

In [9]:
X

0        PolySciMajor EPA chief doesn't think carbon di...
1        It's not like we lack evidence of anthropogeni...
2        RT @RawStory: Researchers say we have three ye...
3        #TodayinMaker# WIRED : 2016 was a pivotal year...
4        RT @SoyNovioDeTodas: It's 2016, and a racist, ...
                               ...                        
15814    RT @ezlusztig: They took down the material on ...
15815    RT @washingtonpost: How climate change could b...
15816    notiven: RT: nytimesworld :What does Trump act...
15817    RT @sara8smiles: Hey liberals the climate chan...
15818    RT @Chet_Cannon: .@kurteichenwald's 'climate c...
Name: message, Length: 15819, dtype: object

## Resampling

In [26]:
#from sklearn.utils import resample
#positive = train[train['sentiment'] == 1]
#negative = train[train['sentiment'] == -1]
#neutral = train[train['sentiment'] == 0] 
#other = train[train['sentiment'] == 2]

#negative_upsampled = resample(positive, replace=True, n_samples=len(other), random_state=27)
#neutral_upsampled = resample(neutral, replace=True, n_samples=len(other), random_state=27)
#unk_upsampled = resample(unk, replace=True, n_samples=len(other), random_state=27)

#U_train = pd.concat([positive, negative_upsampled, neutral_upsampled, unk_upsampled])

In [27]:
#U_train.info()

In [48]:
def tweettoword(tweet): 
    t1 = BeautifulSoup(tweet).get_text()  # Remove HTML   
    letters = re.sub("[^a-zA-Z]", " ", t1) # get letters
    sp_char= re.sub("\\W"," ",t1) # remove special chars
    words = letters.lower().split()  # convert to lowercase and split into words                                        
    #stops = stopwords.words("english")                
    #useful_words = [w for w in words if not w in stops]
    #useful_words1 = " ".join(useful_words) #Join to useful words
    stemmed_words=[porter_stemmer.stem(word=word) for word in words] #

    return " ".join(stemmed_words) #Join to useful words

In [29]:
#from nltk.tokenize import TweetTokenizer
#tknzr = TweetTokenizer()
#trr = train
#tknzr.tokenize(s0)

In [49]:
def tknzr(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)
    # split based on whitespace
    return re.split("\\s+",text)

In [31]:
#y = U_train['sentiment']
#X = U_train['message']

## Turning text into something your model can read

In [32]:
#vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
#X_vectorized = vectorizer.fit_transform(X)

In [63]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             tokenizer=tknzr, 
                             min_df=2,
                             max_df=0.80,
                             analyzer='word',
                             smooth_idf=False, 
                             preprocessor=tweettoword,
                             token_pattern=r'\w{1,}', 
                             max_features=100000,
                             stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


In [14]:
X_vectorized

<15819x24282 sparse matrix of type '<class 'numpy.float64'>'
	with 266216 stored elements in Compressed Sparse Row format>

## Splitting the training data into a training and validation set

In [35]:
#X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.3,shuffle=True, stratify=y, random_state=11)

In [97]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,
                                               y,
                                               test_size=0.20,
                                               shuffle=True, 
                                               random_state=32)

## Training the model and evaluating using the validation set 

In [93]:
sampler =AllKNN(allow_minority=True)
enn_xtrain_tfidf, enn_train_y = sampler.fit_sample(X_train, y_train)

In [41]:
#rfc = RandomForestClassifier()
#rfc.fit(X_train, y_train)
#rfc_pred = rfc.predict(X_val)

In [98]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_val)

## Checking the performance of our model on the validation set

In [99]:
f1_score(y_val, lsvc_pred, average="macro")

0.6481932313304015

In [91]:
from sklearn import metrics

print(metrics.classification_report(y_val, lsvc_pred))

              precision    recall  f1-score   support

          -1       0.70      0.41      0.52       383
           0       0.57      0.41      0.48       674
           1       0.77      0.87      0.81      2614
           2       0.75      0.75      0.75      1075

    accuracy                           0.74      4746
   macro avg       0.70      0.61      0.64      4746
weighted avg       0.73      0.74      0.73      4746



## Getting our test set ready 

In [82]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [31]:
#y_pred = rfc.predict(test_vect)

In [83]:
y_pred = lsvc.predict(test_vect)

In [84]:
test['sentiment'] = y_pred

In [85]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,0
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [86]:
test[['tweetid','sentiment']].to_csv('KD_SUB_14.csv', index=False)