In [96]:
import pandas as pd 
import numpy as np

In [97]:
# Spam data: load the raw message dataset and report how many rows it has.
# (The bare `spam_data.columns` expression that used to sit here was removed:
# it was not the last line of the cell, so its value was simply discarded.)
spam_data = pd.read_csv('category/spam_data.csv')
print(len(spam_data))

5572


In [98]:
# Keep only the rows labelled 'spam'. The explicit .copy() yields an
# independent frame, so the later column assignments do not operate on a
# slice of the original frame (the situation pandas reports with
# SettingWithCopyWarning).
spam_data = spam_data.loc[spam_data['Category'] == 'spam'].copy()
# Rename the message column to 'text', the column name the other datasets use.
spam_data = spam_data.rename(columns={'Message': 'text'})
print(spam_data.head())
print(len(spam_data))

   Category                                               text
2      spam  Free entry in 2 a wkly comp to win FA Cup fina...
5      spam  FreeMsg Hey there darling it's been 3 week's n...
8      spam  WINNER!! As a valued network customer you have...
9      spam  Had your mobile 11 months or more? U R entitle...
11     spam  SIX chances to win CASH! From 100 to 20,000 po...
747


In [99]:
# Encode the spam class as label 0. The frame was already filtered down to
# 'spam' rows in the previous cell, so this relabels every remaining row.
# NOTE(review): the column held strings before this assignment, so it may
# keep object dtype instead of becoming an integer column — confirm before
# the labels are handed to a classifier.
spam_data.loc[spam_data['Category'] == 'spam', 'Category'] = 0
spam_data.head()

Unnamed: 0,Category,text
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
5,0,FreeMsg Hey there darling it's been 3 week's n...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...
11,0,"SIX chances to win CASH! From 100 to 20,000 po..."


In [100]:
# Complaint data: load the raw complaints file and display its column names
# so the available fields are visible before any are dropped.
complaint_data = pd.read_csv('category/complaint_data.csv')
complaint_data.columns

Index(['author', 'posted_on', 'rating', 'text'], dtype='object')

In [101]:
# Discard the author and posting-date columns; 'rating' and 'text' remain.
complaint_data = complaint_data.drop(columns=['author', 'posted_on'])
complaint_data.head()

Unnamed: 0,rating,text
0,1,I used to love Comcast. Until all these consta...
1,1,I'm so over Comcast! The worst internet provid...
2,1,If I could give them a negative star or no sta...
3,1,I've had the worst experiences so far since in...
4,1,Check your contract when you sign up for Comca...


In [102]:
# Keep only the rows whose rating is 0. The .copy() makes the filtered frame
# independent, so the assignment below writes to this frame itself and not
# to a slice of the original (avoids pandas' SettingWithCopyWarning).
complaint_data = complaint_data.loc[complaint_data['rating'] == 0].copy()
# Relabel the kept rows as class 1 (the complaint class in the combined data).
complaint_data.loc[complaint_data['rating'] == 0, 'rating'] = 1
# Use the shared label column name 'Category'.
complaint_data = complaint_data.rename(columns={'rating': 'Category'})
print(complaint_data.head())
print(len(complaint_data))

      Category                                               text
1057         1  Comcast Internet high-speed is not high-speed ...
1062         1  I couldn't get online in the internet although...
1063         1  I am unable to access the internet on one of m...
1064         1  I disconnected Comcast May 2nd 2011.  I was to...
1065         1  I am being charged for services when my servic...
1560


In [103]:
# Review data: load the raw reviews file and display its column names
# so the available fields are visible before any are dropped.
review_data = pd.read_csv('category/review_data.csv')
review_data.columns

Index(['rating', 'date', 'variation', 'verified_reviews', 'feedback'], dtype='object')

In [104]:
# Discard the columns that are not needed; 'verified_reviews' and 'feedback' remain.
review_data = review_data.drop(columns=['rating', 'date', 'variation'])
review_data.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [105]:
# Keep only the rows whose feedback flag is 1. The .copy() makes the filtered
# frame independent, so the assignment below does not write to a slice of the
# original frame (avoids pandas' SettingWithCopyWarning).
review_data = review_data.loc[review_data['feedback'] == 1].copy()
# Relabel the kept rows as class 2 (the review class in the combined data).
review_data.loc[review_data['feedback'] == 1, 'feedback'] = 2
# Align the column names with the other datasets in a single rename.
review_data = review_data.rename(columns={'verified_reviews': 'text', 'feedback': 'Category'})
# Cap the class at its first 1500 rows (positional slice).
review_data = review_data.iloc[:1500]
print(review_data.head())
print(len(review_data))

                                                text  Category
0                                      Love my Echo!         2
1                                          Loved it!         2
2  Sometimes while playing a game, you can answer...         2
3  I have had a lot of fun with this thing. My 4 ...         2
4                                              Music         2
1500


In [106]:
# Combine all dataframes into one master dataframe
data = pd.concat([spam_data, complaint_data, review_data], ignore_index=True)
# The spam labels were written into a column that originally held strings,
# so the concatenated 'Category' column can end up with object dtype.
# Cast it to a plain integer dtype: scikit-learn classifiers reject
# object-dtype targets holding non-string values ("Unknown label type").
data['Category'] = data['Category'].astype(int)
print(data.head())
print(len(data))

  Category                                               text
0        0  Free entry in 2 a wkly comp to win FA Cup fina...
1        0  FreeMsg Hey there darling it's been 3 week's n...
2        0  WINNER!! As a valued network customer you have...
3        0  Had your mobile 11 months or more? U R entitle...
4        0  SIX chances to win CASH! From 100 to 20,000 po...
3807


In [107]:
# Preprocessing

## Remove every row that has a missing value in any column
## (dropna() defaults: axis=0, how='any')
data = data.dropna()
len(data)

3777

In [136]:
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
import string
import spacy

# Lemmatization
def lemmetization(doc):
    """Return one normalised string per token in ``doc``.

    Each token contributes its lemma, lower-cased and stripped of surrounding
    whitespace. Tokens whose lemma is the placeholder "-PRON-" contribute
    their lower-cased surface form (``lower_``) instead.

    Args:
        doc: iterable of token objects exposing ``lemma_`` and ``lower_``.

    Returns:
        list of str, in token order.
    """
    return [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]

# Remove punctuation and stopwords
def punc_removal(doc):
    """Filter stop words and punctuation out of a list of string tokens.

    Args:
        doc: iterable of string tokens.

    Returns:
        list of the tokens that are neither in ``STOP_WORDS`` nor found in
        ``string.punctuation``, in their original order.
    """
    # Membership is tested against STOP_WORDS directly instead of rebuilding
    # list(STOP_WORDS) for every token, which made each lookup a full scan.
    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so the empty string is filtered out as well.
    return [
        word
        for word in doc
        if word not in STOP_WORDS and word not in string.punctuation
    ]

save_words = []
def text_cleaning_english(text,mynlp):
    """Clean one piece of text and return it as a space-joined string.

    The text is run through ``mynlp``, lemmatised with ``lemmetization`` and
    filtered with ``punc_removal``. As a side effect every kept token is
    appended to the module-level ``save_words`` list.

    Args:
        text: the raw text to clean.
        mynlp: callable that turns ``text`` into an iterable of tokens.

    Returns:
        str: the kept tokens joined by single spaces.
    """
    kept_tokens = punc_removal(lemmetization(mynlp(text)))

    # Record all kept tokens in the shared word list.
    save_words.extend(kept_tokens)

    return ' '.join(map(str, kept_tokens))

def most_used_words(words=None, top_n=5):
    """Return the most frequent words as a list of dicts.

    Args:
        words: iterable of words to count. When omitted, the module-level
            ``save_words`` list is used (the original behaviour).
        top_n: how many of the most common words to return (default 5).

    Returns:
        list of ``{'category': word, 'value': count}`` dicts, ordered from
        most to least frequent.
    """
    if words is None:
        words = save_words
    # A comprehension replaces the local that used to be named `list`, which
    # shadowed the builtin inside this function.
    return [
        {'category': word, 'value': count}
        for word, count in Counter(words).most_common(top_n)
    ]

In [137]:
nlp = spacy.load("en_core_web_sm")
text_cleaning = lambda x: text_cleaning_english(x,mynlp=nlp)

# text_cleaning_english appends to the module-level save_words list; empty it
# first so that re-running this cell does not double-count words.
save_words.clear()
# apply() already returns a Series aligned with data's index, so it is
# assigned directly instead of being wrapped in a DataFrame first.
data['cleaned_text'] = data['text'].apply(text_cleaning)

common_words = most_used_words()
print(common_words)

[{'category': 'comcast', 'value': 4853}, {'category': 'service', 'value': 4413}, {'category': 'tell', 'value': 2492}, {'category': 'time', 'value': 2082}, {'category': 'cable', 'value': 1886}]


In [144]:
## Check the cleaned text for missing values (this only flags them; nothing is dropped here)
data['cleaned_text'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
3802    False
3803    False
3804    False
3805    False
3806    False
Name: cleaned_text, Length: 3777, dtype: bool

In [139]:
# SPLIT TRAINING & TESTING DATA
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['Category'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape, y_train.shape)

(3021,) (3021,)


In [140]:
# Preview the cleaned texts that were held out for testing.
X_test

3560    https://www.amazon.com/dp/b073sqyxtw/ref=cm_cr...
999     comcast new site format push user throat choic...
3053    4.5 5 stars generally like product great ask l...
1561    type 132,78 online bill payment instead 132.78...
2458       met exceed expectation easy set quick response
                              ...                        
1686    clear customer service priority clear comacast...
764     kenneth president fairly large electrical cont...
2309    play game answer question correctly alexa wron...
3211                            work great perfect office
3575    easy use dot pick voice far away speaker lot c...
Name: cleaned_text, Length: 756, dtype: object

In [141]:
# Count how many held-out rows fall under each category label.
y_test.value_counts()

1    303
2    284
0    169
Name: Category, dtype: int64

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

def sentiment_pipeline(data_train_input,data_train_target,model_type):
    """Fit a TF-IDF + classifier pipeline and return the fitted model.

    Args:
        data_train_input: training texts passed to the pipeline's ``fit``.
        data_train_target: training labels passed to the pipeline's ``fit``.
        model_type: one of "linear", "logistic", "sgd", "naive_bayes" or
            "xgboost"; selects the classifier placed after the vectorizer.

    Returns:
        The fitted ``Pipeline`` (steps: 'tfidf', then 'clf').

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
            (Previously an unknown name left ``classifier`` unassigned and
            failed later with an UnboundLocalError.)
    """
    # Classifier selection
    if model_type == "linear":
        classifier = LinearSVC()
    elif model_type == "logistic":
        classifier = LogisticRegression(max_iter=1000)
    elif model_type == "sgd":
        classifier = SGDClassifier()
    elif model_type == "naive_bayes":
        classifier = MultinomialNB()
    elif model_type == "xgboost":
        # NOTE(review): both eta=0.1 and learning_rate=0.5 are passed; XGBoost
        # documents these as aliases of one parameter, so only one value can
        # take effect — confirm which is intended.
        classifier = XGBClassifier(use_label_encoder=False,eta=0.1,gamma=0.3, n_estimators=100, learning_rate=0.5, min_child_weight=5, 
        max_depth=5, colsample_bytree=0.7,objective="multi:softmax", eval_metric="mlogloss",verbosity=0)
    else:
        raise ValueError(
            "Unknown model_type " + repr(model_type) + "; expected one of "
            "'linear', 'logistic', 'sgd', 'naive_bayes', 'xgboost'"
        )

    tfidf = TfidfVectorizer()

    # Pipeline setup: vectorize the text, then classify.
    clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

    model = clf.fit(data_train_input,data_train_target)

    return model

def sentiment_model_predict(model,data_test_input,data_test_target):
    """Predict on the test inputs and print evaluation metrics.

    Prints accuracy, macro-averaged precision, recall and F1 (each as a
    percentage rounded to two decimals), followed by the confusion matrix.
    Nothing is returned.

    Args:
        model: fitted object exposing ``predict``.
        data_test_input: inputs passed to ``model.predict``.
        data_test_target: true labels the predictions are compared against.
    """
    predicted = model.predict(data_test_input)
    conf_matrix = confusion_matrix(data_test_target, predicted)

    # Compute every score before printing anything, then report them in order.
    labelled_scores = [
        ("Accuracy : ", accuracy_score(data_test_target, predicted)),
        ("Precision : ", precision_score(data_test_target, predicted, average="macro")),
        ("Recall : ", recall_score(data_test_target, predicted, average="macro")),
        ("F1-Score :", f1_score(data_test_target, predicted, average="macro")),
    ]
    for label, score in labelled_scores:
        print(label + str(round(score*100, 2)))
    print(conf_matrix)

In [2]:
# Xgboost: fit the TF-IDF + XGBoost pipeline on the training split, then
# print its metrics on the test split.
# NOTE(review): this cell needs X_train, y_train, X_test and y_test from the
# train/test split cell to exist in the running kernel. The recorded output
# is a NameError for X_train, so that cell had not been run in the session
# that produced it — re-run the notebook top to bottom before this cell.
xg_model = sentiment_pipeline(X_train, y_train, 'xgboost')
sentiment_model_predict(xg_model,X_test,y_test)

NameError: name 'X_train' is not defined