In [None]:
import numpy as np
import pandas as pd 

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK (rebound as a set in a later cell, before
# data_preprocessing is defined).
stop_words = stopwords.words('english')
import re
import string
from sklearn.metrics import accuracy_score


In [25]:
# Both files are read with header=None, i.e. their first line is treated as
# data and the columns are labelled 0, 1, 2, ...
test_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', header=None)
train_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)

# Stack the two files into one frame. ignore_index=True gives the result a
# unique 0..n-1 index; without it each file keeps its own 0-based labels and
# the combined frame ends up with duplicate index values.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Column 0 is discarded, the remaining three are named, and 'platform' is
# dropped as well, leaving only the label and the tweet text.
df = df.drop(columns=[0])
df.columns = ['platform', 'sentiment', 'text']
df = df.drop(columns=['platform'])

# Encode the labels as integers; "Neutral" and "Irrelevant" share class 0.
# Any label not listed here maps to NaN.
df['sentiment'] = df['sentiment'].map({"Neutral": 0, "Irrelevant": 0, "Positive": 1, "Negative": 2})

# Remove rows with a missing text or an unmapped (NaN) label.
df = df.dropna()

In [None]:
# Show the combined frame (after label mapping and dropna) as the cell output.
df

In [None]:
# Keep the stop words in a set so the per-word membership test performed by
# data_preprocessing is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def data_preprocessing(text, stop_word_set=None):
    """Normalise one piece of text for bag-of-words style models.

    Steps, in order: lower-case, remove ``<...>`` tag-like spans, delete
    ASCII punctuation characters, split on whitespace, drop stop words and
    re-join the remaining words with single spaces.

    Args:
        text: The raw text (str).
        stop_word_set: Optional collection of words to drop. When omitted,
            the module-level ``stop_words`` is used; it is looked up at call
            time, so re-running the cell that defines it takes effect without
            redefining this function.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    text = text.lower()
    # Non-greedy match so each tag-like span is removed on its own.
    text = re.sub('<.*?>', '', text)
    # Punctuation is deleted, not replaced by a space, so "don't" -> "dont".
    text = text.translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if word not in stop_word_set)

# Coerce every entry of the 'text' column to str, run it through
# data_preprocessing, and store the cleaned strings back under the same name.
df = df.assign(text=df['text'].astype(str).map(data_preprocessing))

df.head()

In [28]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=1,
)

In [29]:
# Cast both label vectors to an integer dtype.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [30]:
# Report the size of the training and held-out splits, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(59996,)
(15000,)


## TF-IDF

In [31]:
# Word-level TF-IDF over uni-, bi- and tri-grams. min_df=3 drops terms that
# occur in fewer than three of the documents the vectorizer is fitted on.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    # Real booleans: recent scikit-learn releases validate parameter types
    # and reject integer stand-ins such as use_idf=1.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Learn the vocabulary and IDF weights from the training split only, so that
# nothing about the held-out texts leaks into the features they are scored on.
xtrain_tfv = tfv.fit_transform(X_train)
xvalid_tfv = tfv.transform(X_test)



## Count Vectorizer

In [32]:
# Raw term counts over word uni-, bi- and tri-grams.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
)

# Build the vocabulary from the training split only; n-grams that appear
# solely in the held-out texts are ignored at transform time, which keeps the
# evaluation free of held-out information.
xtrain_ctv = ctv.fit_transform(X_train)
xvalid_ctv = ctv.transform(X_test)

## GloVe

In [33]:
# Location of the GloVe vectors inside the Kaggle input directory
# (the file name indicates the 6B corpus, 200-dimensional vectors).
glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'

In [34]:
def load_word_embeddings(file=None):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        file: Path of the embeddings file. When omitted, the module-level
            ``glove_path`` is used; it is looked up at call time, so
            reassigning ``glove_path`` takes effect without redefining this
            function.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.
    """
    if file is None:
        file = glove_path

    embeddings = {}
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (assumes the file is UTF-8, as the published GloVe text files
    # are; confirm if a different file is used).
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

In [None]:
# Read the whole embeddings file into memory as a {token: vector} dict,
# using the function's default path.
glove_embeddings = load_word_embeddings()

In [None]:
def sentence_features_v2(s, embeddings=None, emb_size=200):
    """Embed a sentence as the mean of its words' embedding vectors.

    Args:
        s: The sentence, either as a whitespace-separated string or as an
            iterable of already-split tokens.
        embeddings: Mapping from token to vector. When omitted, the
            module-level ``glove_embeddings`` is used, looked up at call time.
        emb_size: Length of the zero vector returned when no token of the
            sentence is found in ``embeddings``.

    Returns:
        A 1-D NumPy array: the element-wise mean of the matching vectors, or
        ``np.zeros(emb_size)`` when there is no match.
    """
    if embeddings is None:
        embeddings = glove_embeddings

    # A string has to be split into words first: iterating over it directly
    # yields single characters, which would average letter embeddings instead
    # of word embeddings.
    tokens = s.split() if isinstance(s, str) else list(s)

    # Keep purely alphabetic tokens that have a vector.
    words = [w for w in tokens if w.isalpha() and w in embeddings]
    if len(words) == 0:
        return np.zeros(emb_size)

    M = np.array([embeddings[w] for w in words])
    return M.mean(axis=0)

In [None]:
# One sentence vector per text, stacked into a 2-D array for each split.
train_glove = np.array(list(map(sentence_features_v2, X_train)))
test_glove = np.array(list(map(sentence_features_v2, X_test)))

In [None]:
# Shape check: expected to be one row per training text and one column per
# embedding dimension.
train_glove.shape

In [None]:
# Number of training texts, for comparison with the first dimension of train_glove.
X_train.shape

## Logistic Regression

In [None]:
# Logistic regression on the TF-IDF features.
# `multi_class` is left out on purpose: the argument is deprecated in current
# scikit-learn, and with the default lbfgs solver a problem with more than two
# classes (here 0/1/2) is already fitted as a multinomial model.
clf = LogisticRegression(random_state=0)
clf.fit(xtrain_tfv, y_train)

In [None]:
# Score the TF-IDF model on the held-out split (message text kept verbatim).
val_pred = clf.predict(xvalid_tfv)
tfidf_accuracy = accuracy_score(list(y_test), val_pred)
print("The accurary of logistic regression with tf-idf embedding is", tfidf_accuracy)

In [None]:
# Same model on the raw count features. This rebinds `clf` and `val_pred`,
# replacing the TF-IDF model and its predictions.
# `multi_class` is left out on purpose: the argument is deprecated in current
# scikit-learn, and with the default lbfgs solver a problem with more than two
# classes is already fitted as a multinomial model.
clf = LogisticRegression(random_state=0)
clf.fit(xtrain_ctv, y_train)
val_pred = clf.predict(xvalid_ctv)
print("The accurary of logistic regression with count-vectorizer embedding is",accuracy_score(list(y_test), val_pred))

In [None]:
# Wrap the most recent val_pred in a single-column DataFrame. When the cells
# are run top to bottom these are the count-vectorizer model's predictions on
# the held-out split.
predict=pd.DataFrame(val_pred)


In [None]:
# Write the predictions to submission.csv without the index column.
# NOTE(review): DataFrame.to_csv returns None when it is given a path, so
# predict_file is presumably always None here — confirm nothing relies on it.
predict_file=predict.to_csv("submission.csv", index=False)