In [1]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import pickle
import nltk
import preprocessor as p
# Select the URL, mention and hashtag options of the tweet preprocessor;
# presumably these are the token kinds p.clean() strips (consistent with the
# cleaned tweets printed further down) — confirm against the library docs.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer

# The stemmer below is built with ignore_stopwords=True, so make sure the
# NLTK stopword corpus is available locally before constructing it.
nltk.download('stopwords')

# English Snowball stemmer, configured to ignore stopwords.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

# Regex tokenizer that emits runs of ASCII letters and digits.
token = RegexpTokenizer(pattern=r'[a-zA-Z0-9]+')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Encoding used to decode the training CSV.
DATASET_ENCODING = "ISO-8859-1"

dataset = pd.read_csv('./Corona_NLP_train.csv', delimiter=',', encoding=DATASET_ENCODING)

# A bare `dataset.head()` in the middle of a cell is evaluated and thrown
# away; display() (provided by the IPython kernel) actually renders the preview.
display(dataset.head())

def preprocess_tweets(tweet):
    """Clean a tweet and stem each whitespace-separated word.

    The text is passed through ``p.clean``, split on whitespace, and every
    resulting piece is run through the module-level ``stemmer``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed pieces joined by single spaces.
    """
    cleaned = p.clean(tweet)
    stems = []
    for word in cleaned.split():
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

# Raw tweet text column; shown as this cell's output for a quick look.
X = dataset.loc[:, 'OriginalTweet']
X


0        @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the #COV...
                               ...                        
41152    Airline pilots offering to stock supermarket s...
41153    Response to complaint not provided citing COVI...
41154    You know itÂs getting tough when @KameronWild...
41155    Is it wrong that the smell of hand sanitizer i...
41156    @TartiiCat Well new/used Rift S are going for ...
Name: OriginalTweet, Length: 41157, dtype: object

In [3]:


# Build the model input straight from the raw column instead of from `X`
# itself, so re-running this cell never stems already-stemmed text.
X = dataset['OriginalTweet'].apply(preprocess_tweets)

# Target: the value of the 'Sentiment' column for each tweet.
y = dataset['Sentiment']

X


0                                                  and and
1        advic talk to your neighbour famili to exchang...
2        coronavirus australia: woolworth to give elder...
3        my food stock is not the only one which is emp...
4        me, readi to go at supermarket during the outb...
                               ...                        
41152    airlin pilot offer to stock supermarket shelv ...
41153    respons to complaint not provid cite covid-19 ...
41154    you know itâ get tough when is ration toilet ...
41155    is it wrong that the smell of hand sanit is st...
41156    well new/us rift s are go for $700.00 on amazo...
Name: OriginalTweet, Length: 41157, dtype: object

In [4]:

# Display the target labels as this cell's output.
y
# X.shape


0                   Neutral
1                  Positive
2                  Positive
3                  Positive
4        Extremely Negative
                ...        
41152               Neutral
41153    Extremely Negative
41154              Positive
41155               Neutral
41156              Negative
Name: Sentiment, Length: 41157, dtype: object

In [29]:

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# X_train.shape, X_test.shape

# Pipeline: TF-IDF features over unigrams and bigrams -> SVM classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 2),
        tokenizer=token.tokenize,
        # A custom tokenizer overrides token_pattern; setting it to None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
    )),
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; seeding it makes predict_proba reproducible.
    ('clf', SVC(probability=True, random_state=5)),
])

pipeline.fit(X_train, y_train)


In [None]:

# Predict a label for every held-out tweet with the fitted pipeline.
y_pred = pipeline.predict(X_test)


In [None]:

# print(pipeline.predict_proba(sub_main))
print(classification_report(y_test, y_pred))

# The fitted steps live inside the pipeline; pull them out by step name so
# they can be used and exported individually.
svc = pipeline.named_steps['clf']
tfidf = pipeline.named_steps['tfidf']

# test_tweet = "scandinavia #news:  norway : it's illegal for employers to require covid  passports  denmark\
#     sweden : they won't be bringing in covid  vaccination passports  #holdtheline #enoughisenough #nomedicalapartheid #nomasks #nomorelockdowns #openforall #corona #coronavirus"
# # test_tweet2 = "everyone should get vaccinated as soon as possible"
# vector = tfidf.transform([test_tweet])

# print(svc.predict(vector))

# Accuracy as a truncated whole percentage, used to tag the saved files.
acc = int(accuracy_score(y_test, y_pred)*100)

# exporting the model and the trained vectorizer
# (the ./models and ./vector directories must already exist)
with open(f'./models/SVC_model_{acc}', 'wb') as model_file:
    pickle.dump(svc, model_file)
with open(f'./vector/tfidf_vectorizer_{acc}', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)