In [1]:
import pandas as pd

In [2]:
blog = pd.read_csv("blogtext.csv")
#data = pd.read_csv("blogtext.csv", nrows = 70000,index_col=False)

In [3]:
blog.shape

(681284, 7)

In [4]:
blog.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
blog['topic'].value_counts(normalize=True)*100

indUnk                     36.844400
Student                    22.590139
Technology                  6.172903
Arts                        4.762918
Education                   4.349581
Communications-Media        2.956183
Internet                    2.349387
Non-Profit                  2.157690
Engineering                 1.710447
Law                         1.326906
Publishing                  1.137998
Science                     1.066956
Government                  1.013821
Consulting                  0.860434
Religion                    0.768402
Fashion                     0.712038
Marketing                   0.700002
Advertising                 0.686351
BusinessServices            0.660517
Banking                     0.594319
Chemicals                   0.576558
Telecommunications          0.571127
Accounting                  0.562467
Military                    0.459133
Museums-Libraries           0.454436
Sports-Recreation           0.445923
HumanResources              0.441813
R

Due to class imbalance, working with the more popular topics

In [6]:
top_topics = ['indUnk', 'Student', 'Technology', 'Arts', 'Education']

In [7]:
#test = test[~test['Cause'].isin(df_exclude)]
blog_top_topics = blog[blog['topic'].isin(top_topics)]

In [8]:
blog_top_topics['topic'].value_counts(normalize=True)*100

indUnk        49.309996
Student       30.233079
Technology     8.261386
Arts           6.374360
Education      5.821178
Name: topic, dtype: float64

Dropping Id and Date columns

In [9]:
blog_top_topics.drop(labels=['id','date'], axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [10]:
blog_top_topics.head(5)

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
74,female,14,indUnk,Aries,O= optimist P= pessimist My...


In [11]:
blog_top_topics.shape

(509055, 5)

Randomizing the data

In [12]:
blog_top_topics = blog_top_topics.sample(frac=1).reset_index(drop=True)

In [13]:
blog_top_topics.head(5)

Unnamed: 0,gender,age,topic,sign,text
0,female,17,indUnk,Sagittarius,Okay so maybe I lied and I can't go...
1,male,16,Student,Libra,American Idol. This has got to be m...
2,female,15,Student,Virgo,Hello everybody~! I duno how to desig...
3,female,48,indUnk,Pisces,i have been informed that mamma is not ...
4,male,27,Technology,Aquarius,This URL explains the major differences b...


Pre Processing

- Removal of stopwords
- Removal of unwanted characters/punctuations
- Removal of unwanted space
- Conversion to lowercase
- Stemming/Lemmatization

In [34]:
df = blog_top_topics[["text"]]

In [35]:
df[:5]

Unnamed: 0,text
0,A week from Thursday my father is under...
1,I really do hate chosing. I really...
2,I'm almost done with my book. I'm...
3,My ISP does some real aggressive spam f...
4,...and when i realized that there was n...


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509055 entries, 0 to 509054
Data columns (total 1 columns):
text    509055 non-null object
dtypes: object(1)
memory usage: 3.9+ MB


In [37]:
import re
def remove_trailing(text):
    trailing_space = re.compile(r'\\t|\\n|\\r|\t|\n|\r')
    text = trailing_space.sub(r' ', text)   
    return text

def remove_extra_white_space(text):
    trailing_space = re.compile(r'\s\s+')
    text = trailing_space.sub(r' ', text)
    return text
    
    

def cleaning_process(text):
    text = text.lower()
    remove_trailing_text = remove_trailing(text)
    final_text = remove_extra_white_space(remove_trailing_text)
    return final_text

In [38]:
df['text'] = df['text'].apply(cleaning_process)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
#spaCy Code Initialization:
import spacy
nlp = spacy.load('en_core_web_sm')
#NLTK Code Initialization:
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *


def spacy_process(text):
    doc = nlp(text) # "how are you"
#Tokenization and lemmatization are done with the spacy nlp pipeline commands
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
#     print("Tokenize+Lemmatize:")
#     print(lemma_list)
    
    #Filter the stopword
    filtered_sentence =[] 
    for word in lemma_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    
    #Remove punctuation
    punctuations="?:!.,;"
    for word in filtered_sentence:
        if word in punctuations:
            filtered_sentence.remove(word)
#     print(" ")
#     print("Remove stopword & punctuation: ")
#     print(filtered_sentence)
    return filtered_sentence # 

In [40]:
df['normalized_text'] = df['text'].apply(spacy_process)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
df.to_csv("normalized_text1.csv") #backup after lemmatization

In [14]:
df = pd.read_csv("normalized_text1.csv", index_col=False)

In [15]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,text,normalized_text
0,0,a week from thursday my father is undergoing ...,"[' ', 'week', 'thursday', 'father', 'undergo',..."
1,1,i really do hate chosing. i really hate how i...,"[' ', 'hate', 'chosing', 'hate', 'chose', 'uni..."
2,2,i'm almost done with my book. i'm on page 824...,"[' ', 'book', 'page', '824', 'problem', 'find'..."
3,3,my isp does some real aggressive spam filteri...,"[' ', 'isp', 'real', 'aggressive', 'spam', 'fi..."
4,4,...and when i realized that there was no scen...,"[' ', '...', 'realize', 'scent', 'picture', '2..."


Merging columns to prepare composite target classes

In [16]:
blog_top_topics["age"] = blog_top_topics["age"].astype(str)

In [17]:
#blog_top_topics['legend']=blog_top_topics['gender']+" -> " + blog_top_topics['age'] +" -> "+ blog_top_topics['sign'] +" -> "+ blog_top_topics['topic']

In [18]:
#blog_top_topics['legend']=blog_top_topics.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)
blog_top_topics['legend']=blog_top_topics.apply(lambda col: [col['gender'],col['age'],col['topic'],col['sign']], axis=1)

In [19]:
blog_top_topics.head(5)

Unnamed: 0,gender,age,topic,sign,text,legend
0,female,17,indUnk,Sagittarius,Okay so maybe I lied and I can't go...,"[female, 17, indUnk, Sagittarius]"
1,male,16,Student,Libra,American Idol. This has got to be m...,"[male, 16, Student, Libra]"
2,female,15,Student,Virgo,Hello everybody~! I duno how to desig...,"[female, 15, Student, Virgo]"
3,female,48,indUnk,Pisces,i have been informed that mamma is not ...,"[female, 48, indUnk, Pisces]"
4,male,27,Technology,Aquarius,This URL explains the major differences b...,"[male, 27, Technology, Aquarius]"


In [20]:
blog_normalized = pd.concat((df['normalized_text'], blog_top_topics['legend']), axis=1)

In [21]:
blog_normalized.head(), blog_normalized.shape

(                                     normalized_text  \
 0  [' ', 'week', 'thursday', 'father', 'undergo',...   
 1  [' ', 'hate', 'chosing', 'hate', 'chose', 'uni...   
 2  [' ', 'book', 'page', '824', 'problem', 'find'...   
 3  [' ', 'isp', 'real', 'aggressive', 'spam', 'fi...   
 4  [' ', '...', 'realize', 'scent', 'picture', '2...   
 
                               legend  
 0  [female, 17, indUnk, Sagittarius]  
 1         [male, 16, Student, Libra]  
 2       [female, 15, Student, Virgo]  
 3       [female, 48, indUnk, Pisces]  
 4   [male, 27, Technology, Aquarius]  , (509055, 2))

In [22]:
#blog_normalized.to_csv("blog_normalized2.csv") #backup after label and text preprocessing

Label encoding composite Target column

In [23]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# blog_normalized['legend'] = le.fit_transform(blog_normalized['legend'].values)

In [24]:
blog_top_topics.legend.values

array([list(['female', '17', 'indUnk', 'Sagittarius']),
       list(['male', '16', 'Student', 'Libra']),
       list(['female', '15', 'Student', 'Virgo']), ...,
       list(['female', '43', 'Arts', 'Gemini']),
       list(['male', '16', 'Student', 'Gemini']),
       list(['female', '16', 'Student', 'Aries'])], dtype=object)

In [25]:
label_counts=dict()

for labels in blog_top_topics.legend.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [26]:
label_counts

{'female': 263854,
 '17': 68769,
 'indUnk': 251015,
 'Sagittarius': 36710,
 'male': 245201,
 '16': 67226,
 'Student': 153903,
 'Libra': 47689,
 '15': 38151,
 'Virgo': 46007,
 '48': 2354,
 'Pisces': 38801,
 '27': 31239,
 'Technology': 42055,
 'Aquarius': 36586,
 '25': 43458,
 'Gemini': 37770,
 '34': 17570,
 'Cancer': 48994,
 '42': 1944,
 '33': 9566,
 'Taurus': 47933,
 '24': 58544,
 'Aries': 48575,
 '36': 7176,
 '35': 13330,
 'Arts': 32449,
 'Scorpio': 42353,
 '23': 49952,
 'Capricorn': 36670,
 '38': 6140,
 'Education': 29633,
 'Leo': 40967,
 '26': 33223,
 '14': 25309,
 '40': 3360,
 '43': 2838,
 '44': 998,
 '47': 994,
 '13': 11592,
 '41': 2699,
 '39': 3853,
 '37': 5586,
 '46': 836,
 '45': 2348}

In [27]:
sorted(label_counts.keys())

['13',
 '14',
 '15',
 '16',
 '17',
 '23',
 '24',
 '25',
 '26',
 '27',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 'Aquarius',
 'Aries',
 'Arts',
 'Cancer',
 'Capricorn',
 'Education',
 'Gemini',
 'Leo',
 'Libra',
 'Pisces',
 'Sagittarius',
 'Scorpio',
 'Student',
 'Taurus',
 'Technology',
 'Virgo',
 'female',
 'indUnk',
 'male']

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))
Y = binarizer.fit_transform(blog_top_topics.legend)

In [36]:
Y.shape, blog_normalized['legend'].shape

((509055, 45), (509055,))

Normalized Text Data with merged Labels

In [37]:
#blog_normalized = pd.read_csv("blog_normalized.csv", nrows = 70000, index_col=0)
#blog_normalized = pd.read_csv("blog_normalized.csv", index_col=0)

In [38]:
blog_normalized.head()

Unnamed: 0,normalized_text,legend,normalized_text_merged
0,"[' ', 'week', 'thursday', 'father', 'undergo',...","[female, 17, indUnk, Sagittarius]",week thursday father undergo 2 operation fix...
1,"[' ', 'hate', 'chosing', 'hate', 'chose', 'uni...","[male, 16, Student, Libra]",hate chosing hate chose university hate chos...
2,"[' ', 'book', 'page', '824', 'problem', 'find'...","[female, 15, Student, Virgo]",book page 824 problem find little boring kin...
3,"[' ', 'isp', 'real', 'aggressive', 'spam', 'fi...","[female, 48, indUnk, Pisces]",isp real aggressive spam filtering spam anym...
4,"[' ', '...', 'realize', 'scent', 'picture', '2...","[male, 27, Technology, Aquarius]",... realize scent picture 2:30 morning ........


Converting tokens back to string since tf-idf accepts only strings

In [32]:
#blog_temp = blog_normalized.head(70000)

In [33]:
from ast import literal_eval

def form_text(text):
    convert_to_array = literal_eval(text)
    combined_text = ' '.join(convert_to_array)
    #combined_text = ' '.join(text)
    return combined_text
blog_normalized['normalized_text_merged'] = blog_normalized['normalized_text'].apply(form_text)

In [39]:
blog_normalized.head()

Unnamed: 0,normalized_text,legend,normalized_text_merged
0,"[' ', 'week', 'thursday', 'father', 'undergo',...","[female, 17, indUnk, Sagittarius]",week thursday father undergo 2 operation fix...
1,"[' ', 'hate', 'chosing', 'hate', 'chose', 'uni...","[male, 16, Student, Libra]",hate chosing hate chose university hate chos...
2,"[' ', 'book', 'page', '824', 'problem', 'find'...","[female, 15, Student, Virgo]",book page 824 problem find little boring kin...
3,"[' ', 'isp', 'real', 'aggressive', 'spam', 'fi...","[female, 48, indUnk, Pisces]",isp real aggressive spam filtering spam anym...
4,"[' ', '...', 'realize', 'scent', 'picture', '2...","[male, 27, Technology, Aquarius]",... realize scent picture 2:30 morning ........


In [40]:
blog_normalized.shape

(509055, 3)

In [58]:
blog_normalized.to_csv("blog_normalized2.csv") #backup after all preprocessing

Working with subset of rows since running out of memory while trying to run with 5 lakh rows

In [None]:
#for idx,name in enumerate(df['Surface'].value_counts().index.tolist()):

Vectorizing with Count Vectorizer

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
vectorizer=CountVectorizer(binary=True, ngram_range=(1,2))

In [None]:
X = blog_normalized.normalized_text_merged

In [70]:
X_vector = vectorizer.fit_transform(X)

Train Test Split

In [71]:
from sklearn.model_selection import train_test_split

X = X_vector
y = Y

X_train, X_test, y_train, y_test =train_test_split(X,y, random_state=42,
                                                   test_size = 0.2,
                                                   #stratify=y,
                                                  shuffle = True)

In [72]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((407244, 14269177), (101811, 14269177), (407244, 45), (101811, 45))

In [73]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [76]:
model=LogisticRegression(solver='lbfgs')
model=OneVsRestClassifier(model)

In [77]:
model.fit(X_train, y_train)





OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

Train accuracy to check model health

In [78]:
y_pred = model.predict(X_train)

In [79]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_train, y_pred):
    print('Accuracy score: ', accuracy_score(y_train, y_pred))
    print('F1 score: ', f1_score(y_train, y_pred, average='micro'))
    print('Average precision score: ', average_precision_score(y_train, y_pred, average='micro'))
    print('Average recall score: ', recall_score(y_train, y_pred, average='micro'))

In [80]:
print_evaluation_scores(y_train, y_pred)

Accuracy score:  0.7288971722112542
F1 score:  0.8961455197625462
Average precision score:  0.821791944183972
Average recall score:  0.8361719264126666


Test accuracy

In [81]:
y_pred1 = model.predict(X_test)

In [82]:
def print_evaluation_scores1(y_test, y_pred1):
    print('Accuracy score: ', accuracy_score(y_test, y_pred1))
    print('F1 score: ', f1_score(y_test, y_pred1, average='micro'))
    print('Average precision score: ', average_precision_score(y_test, y_pred1, average='micro'))
    print('Average recall score: ', recall_score(y_test, y_pred1, average='micro'))

In [84]:
print_evaluation_scores1(y_test, y_pred1)

Accuracy score:  0.00015715394210841658
F1 score:  0.2727084507137863
Average precision score:  0.15844678800010453
Average recall score:  0.19729449666538978


In [85]:
import pickle
import os

pickle.dump(
        model,
        open(os.path.join(
            '.', 'blog-legend-countvect1.model'),
            'wb'
        )
    )

In [86]:
model2 = open(os.path.join(
            '.', 'blog-legend-countvect1.model'),
            'rb')

tf-idf

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.decomposition import TruncatedSVD
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# model_logreg = Pipeline([
#                 ('tfidf', TfidfVectorizer(
#                                           min_df=3,
#                                           max_features=None,
#                                           strip_accents='unicode',
#                                           analyzer='word',
#                                           ngram_range=(1, 2),
#                                           use_idf=1,
#                                           smooth_idf=1,
#                                           sublinear_tf=1
#                                           )),
#                 ('svd',TruncatedSVD(n_components=500)),
#                 ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs')),
#                ])
# model_logreg.fit(X_train, y_train)

In [None]:
# import pickle
# pickle.dump(
#         model_logreg,
#         open(os.path.join(
#             '.', 'blog-legend.model'),
#             'wb'
#         )
#     )