In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the tweet emotion dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('tweet_emotions.csv', encoding='ISO_8859_1')

# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [5]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases documents token by token."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return a copy of ``X`` with each document tokenized, lower-cased
        and re-joined with single spaces. ``X`` itself is left untouched.
        """
        normalized = X.copy()
        for position in range(len(X)):
            tokens = word_tokenize(normalized[position])
            normalized[position] = ' '.join(map(str.lower, tokens))
        return normalized

In [6]:
# Stand-alone normalizer instance used for the quick demo in the next cell.
norm = TextNormalizer()

In [7]:
# Smoke test: punctuation becomes separate tokens and everything is lower-cased.
norm.fit_transform(["Hellow, my name, children"])

['hellow , my name , children']

In [8]:
import nltk
# Fetch the NLTK stop-word lists; reported as already up-to-date when present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
class WordExtractor(BaseEstimator, TransformerMixin):
    """Remove stop words and corpus-wide hapaxes (tokens seen exactly once).

    Parameters
    ----------
    stop_words : iterable of str
        Tokens that are removed from every document in ``transform``.
    language : str, default="english"
        Punkt model name handed to ``word_tokenize``. The same value is used
        in ``fit`` and ``transform`` so both steps split text identically.
    """

    def __init__(self, stop_words, language="english"):
        self.stop_words = stop_words
        self.language = language

    def fit(self, X, y=None, **fit_params):
        """Count token frequencies over ``X`` and record the hapaxes.

        Sets ``general_freq`` (a ``FreqDist`` over every token in ``X``) and
        ``hapaxes`` (the list of tokens that occur exactly once).

        Returns
        -------
        self
        """
        self.general_freq = FreqDist()
        for document in X:
            # Counter-style update: one count is added per token occurrence.
            self.general_freq.update(
                word_tokenize(document, language=self.language)
            )
        self.hapaxes = self.general_freq.hapaxes()
        return self

    def transform(self, X, y=None, **fit_params):
        """Return a copy of ``X`` keeping only non-hapax, non-stop-word tokens.

        Surviving tokens of each document are re-joined with single spaces.
        ``X`` itself is not modified. Requires a prior call to ``fit``.
        """
        # Build the exclusion set once: testing each token against the two
        # original lists is linear per token and dominates on a large corpus.
        excluded = set(self.hapaxes).union(self.stop_words)
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join(
                token
                for token in word_tokenize(X[i], language=self.language)
                if token not in excluded
            )
        return X_copy

In [10]:
# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words("english")

In [11]:
# Extractor configured with the English stop-word list, used for the toy demo below.
word_extractor = WordExtractor(stop_words)

In [12]:
# Four-sentence toy corpus for exercising WordExtractor.
corpus = [
    'Children drink milk',
    'I love milk',
    'My children play together',
    'It-s fun to play together',
]

In [13]:
# Fit on the toy corpus and transform it: tokens seen only once and stop words are dropped.
word_extractor.fit_transform(corpus)

['milk', 'milk', 'play together', 'play together']

In [14]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    """Transformer that replaces every token of a document with its stem."""

    def __init__(self, stemmer):
        # Any object exposing ``stem(token)`` is accepted.
        self.stemmer = stemmer

    def fit(self, X, y=None, **fit_tranform):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_tranform):
        """Return a copy of ``X`` with each document tokenized, stemmed
        token by token and re-joined with single spaces.
        """
        stemmed = X.copy()
        for position in range(len(X)):
            tokens = word_tokenize(stemmed[position])
            stems = [self.stemmer.stem(tok) for tok in tokens]
            stemmed[position] = ' '.join(stems)
        return stemmed

In [15]:
from nltk.stem.snowball import SnowballStemmer

In [16]:
# NOTE(review): with the "russian" model the English demo sentence below comes back
# unchanged (see that cell's output), and the pipeline uses PorterStemmer instead —
# confirm that "russian" is really intended here.
snowball_stemmer = SnowballStemmer("russian")

In [19]:
# Wrap the Snowball stemmer in the sklearn-style transformer.
apply_stemmer = ApplyStemmer(snowball_stemmer)

In [20]:
# Run the stemming transformer on a single sample sentence.
apply_stemmer.fit_transform(['Drink milk, children, you will be healthy!'])

['Drink milk , children , you will be healthy !']

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
# End-to-end classifier: lower-case -> drop stop words / hapaxes -> stem -> TF-IDF -> logistic regression.
pipe = Pipeline(steps=[
    ("norm", TextNormalizer()),
    ("extractor", WordExtractor(stop_words)),
    ("stemmer", ApplyStemmer(PorterStemmer())),
    ("vectorizer", TfidfVectorizer()),
    ("logic", LogisticRegression(max_iter=1000)),
])

In [24]:
# Features are the raw tweet texts; targets are the sentiment labels.
X = df['content'].values
y = df['sentiment'].values 

In [25]:
# Fix the shuffle seed so the train/test split, and every metric computed from it,
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
# Fit every pipeline step, ending with the classifier, on the training split.
pipe.fit(X_train, y_train)

In [28]:
# Predicted sentiment labels for the held-out tweets.
wypred = pipe.predict(X_test)

In [29]:
# scikit-learn's documented order is (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, wypred)

0.3423

In [30]:
import pickle

In [33]:
# Persist the fitted pipeline; the context manager closes the file even if pickling fails.
with open("pipe.pkl", "wb") as pkl_file:
    pickle.dump(pipe, pkl_file)