In [1]:
import pandas as pd

**CREATING INDIVIDUAL DATAFRAMES**

In [2]:
amazon_df = pd.read_csv('amazon_reviews.csv')
twitter_df = pd.read_csv('Tweets.csv')
yelp_df = pd.read_csv('yelp.csv')

In [3]:
print("--------------------------AMAZON---------------------")
print(amazon_df.columns)
print("\n--------------------------TWITTER---------------------")
print(twitter_df.columns)
print("\n--------------------------YELP---------------------")
print(yelp_df.columns)

--------------------------AMAZON---------------------
Index(['Unnamed: 0', 'reviewerName', 'overall', 'reviewText', 'reviewTime',
       'day_diff', 'helpful_yes', 'helpful_no', 'total_vote',
       'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound'],
      dtype='object')

--------------------------TWITTER---------------------
Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')

--------------------------YELP---------------------
Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')


**PERFORMING DATA LOADING BY CHOOSING ONLY REQUIRED COLUMNS ON DATAFRAME**

In [4]:
def data_loading(df,features,label):
    df_reduced = df[features + label]      # + ----> concatanation 
    df_reduced = df_reduced.dropna(axis=0,how='any') #removing rows having nan
    df_reduced.columns = ['review','analysis']
    return df_reduced

EXTRACTING REQUIRED COLUMNS FOR AMAZON REVIEWS

In [5]:
features = ['reviewText']
label = ['overall']
amazon_df_reduced = data_loading(amazon_df,features,label)
amazon_df_reduced.head()

Unnamed: 0,review,analysis
0,No issues.,4.0
1,"Purchased this for my device, it worked as adv...",5.0
2,it works as expected. I should have sprung for...,4.0
3,This think has worked out great.Had a diff. br...,5.0
4,"Bought it with Retail Packaging, arrived legit...",5.0


EXTRACTING REQUIRED COLUMNS FOR TWITTER REVIEWS

In [6]:
features = ['text']
label = ['sentiment']
twitter_df_reduced = data_loading(twitter_df,features,label)
twitter_df_reduced.head()

Unnamed: 0,review,analysis
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


EXTRACTING REQUIRED COLUMNS FOR YELP REVIEWS

In [7]:
features = ['text']
label = ['stars']
yelp_df_reduced = data_loading(yelp_df,features,label)

**DEALING WITH LABELS**

In [8]:
print(amazon_df_reduced['analysis'].unique())
print(twitter_df_reduced['analysis'].unique())
print(yelp_df_reduced['analysis'].unique())

[4. 5. 3. 1. 2.]
['neutral' 'negative' 'positive']
[5 4 2 3 1]


MAPPING LABELS OF AMAZON REVIEW TO ['NEGATIVE','NEUTRAL','POSITIVE]

In [9]:
for x in amazon_df_reduced['analysis'].unique():
    if int(x)<=3:
        #amazon_df_reduced.loc[amazon_df_reduced['analysis']==x,'analysis'] = 'negative'
        amazon_df_reduced.loc[amazon_df_reduced['analysis']==x,'analysis'] = 0
    #elif int(x)==3:
    #    amazon_df_reduced.loc[amazon_df_reduced['analysis']==x,'analysis'] = 'neutral'
    elif int(x)>=4:
        #amazon_df_reduced.loc[amazon_df_reduced['analysis']==x,'analysis'] = 'positive'
        amazon_df_reduced.loc[amazon_df_reduced['analysis']==x,'analysis'] = 1


MAPPING LABELS OF TWITTER REVIEW TO ['NEGATIVE','NEUTRAL','POSITIVE]

In [10]:
for x in twitter_df_reduced['analysis'].unique():
    if x == 'negative':
        #twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 'negative'
        twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 0
    elif x == 'neutral':
        #twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 'neutral'
        twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 0
    elif x == 'positive':
        #twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 'positive'
        twitter_df_reduced.loc[twitter_df_reduced['analysis']==x,'analysis'] = 1

MAPPING LABELS OF AMAZON YELP TO ['NEGATIVE','NEUTRAL','POSITIVE]

In [11]:
for x in yelp_df_reduced['analysis'].unique():
    if int(x)<=3:
        #yelp_df_reduced.loc[yelp_df_reduced['analysis']==x,'analysis'] = 'negative'
        yelp_df_reduced.loc[yelp_df_reduced['analysis']==x,'analysis'] = 0
    #elif int(x)==3:
    #    yelp_df_reduced.loc[yelp_df_reduced['analysis']==x,'analysis'] = 'neutral'
    elif int(x)>=4:
        #yelp_df_reduced.loc[yelp_df_reduced['analysis']==x,'analysis'] = 'positive'
        yelp_df_reduced.loc[yelp_df_reduced['analysis']==x,'analysis'] = 1

**AGGREGATING DATAFRAMES**

In [12]:
df = pd.concat([amazon_df_reduced,twitter_df_reduced,yelp_df_reduced],axis=0,names=['review','analysis'])

print("Number of classes :",len(df['analysis'].unique()))
print()
print(df.shape[0] == amazon_df_reduced.shape[0]+twitter_df_reduced.shape[0]+yelp_df_reduced.shape[0])
print()
print(df.isna().sum())

Number of classes : 2

True

review      0
analysis    0
dtype: int64


In [13]:
'''X = df['review'].to_frame()
y = df['analysis'].to_frame()
X.shape[0] == y.shape[0]'''

"X = df['review'].to_frame()\ny = df['analysis'].to_frame()\nX.shape[0] == y.shape[0]"

In [14]:
'''X = df['review'].to_frame()
y = df['analysis'].to_frame()
X.shape[0] == y.shape[0]'''

"X = df['review'].to_frame()\ny = df['analysis'].to_frame()\nX.shape[0] == y.shape[0]"

In [15]:
import re
from sklearn.pipeline import Pipeline
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
word_lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

In [16]:
def preprocess(sentence):
    sentence = re.sub('<[^>]*>','',sentence)
    emoticons = re.findall(':-?[()]',sentence)
    sentence = re.sub('[\W]+|[\d]+',' ',sentence.lower()) + ' '.join(emoticons)
    return sentence

In [17]:
def lemmatizer(sentence):
    '''
    lem_sent = []
    for word in nlp(sentence):
        lem_sent.append(word.lemma_)
    return ''.join(lem_sent)
    '''
    
    return ''.join([word_lemmatizer.lemmatize(word) for word in sentence])

In [18]:
def stop_words_remover(sentence):
    sentence = ''.join(sentence)
    stopwords = nlp.Defaults.stop_words
    new_sent = ''
    for word_token in sentence.split():
        if word_token not in stopwords:
            new_sent = new_sent + word_token + ' '
    return new_sent
    
    '''
    stop_words = stopwords.words('english')
    return ''.join([word for word in sentence if word not in stop_words])
    '''

In [19]:
import numpy as np
X_ = np.array([])
np.append(X_,[['asc']])
np.append(X_,[['cdsf']])
X_.shape

(0,)

In [20]:
from sklearn.base import TransformerMixin,BaseEstimator
class DataCleaner(BaseEstimator,TransformerMixin):
    def __init__(self,X,y):
        self.X = X
        self.y = y
        
    
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X_ = X.copy()
        for i in range(X_.shape[0]):
            X_.iloc[i] = preprocess(X_.iloc[i])
            X_.iloc[i] = stop_words_remover(X_.iloc[i])
            X_.iloc[i] = lemmatizer(X_.iloc[i])
        return X_

In [21]:
'''
df['review'] = df['review'].apply(preprocess)
df['review'] = df['review'].apply(stop_words_remover)
df['review'] = df['review'].apply(lemmatizer)
'''

"\ndf['review'] = df['review'].apply(preprocess)\ndf['review'] = df['review'].apply(stop_words_remover)\ndf['review'] = df['review'].apply(lemmatizer)\n"

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(lowercase=False,
                        #preprocessor=preprocess,
                        #tokenizer=lemmatizer,
                        #stop_words=nlp.Defaults.stop_words,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True
                        )
                        

In [23]:
X = df['review']
y = df['analysis']
y = y.astype('int')
print(y.unique())

[0 1]


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=True,test_size=0.5,random_state=39)
clf = LinearSVC()

In [25]:
pipe = Pipeline([
                ('data_cleaning',DataCleaner(X,y)),
                ('vectorizer',tfidf),
                ('classifier',clf)
        ])

In [26]:
X_train.shape[0] == y_train.shape[0]

True

In [27]:
pipe.fit(X_train,y_train)

In [29]:
pipe.score(X_test,y_test)

0.878426192385715

In [37]:
test_val = pd.Series(['This was great'])
pipe.predict(test_val)

array([1, 1])