### Load the dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
try:
    from nltk.corpus import stopwords
except:
    nltk.download('stopwords')
    from nltk.corpus import stopwords

In [2]:
blog_data = pd.read_csv('blogtext.csv')

In [3]:
blog_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [4]:
blog_data.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
data = blog_data.head(5000)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
id        5000 non-null int64
gender    5000 non-null object
age       5000 non-null int64
topic     5000 non-null object
sign      5000 non-null object
date      5000 non-null object
text      5000 non-null object
dtypes: int64(2), object(5)
memory usage: 273.6+ KB


### Preprocess rows of the “text” column

In [6]:
# Convert text to lowercase
data = data.applymap(lambda s: s.lower() if type(s) == str else s)
# Remove unwanted characters
data = data.applymap(lambda s: re.sub('[^0-9a-z #+_]', " ", s) if type(s) == str else s)
# Remove unwanted spaces
data = data.applymap(lambda s: s.strip() if type(s) == str else s)
# Remove stopwords
stopwords = stopwords.words('english')
for sent in data['text']:
    sent = [wrd for wrd in sent if wrd not in stopwords]

In [7]:
blog_data['text'][0]

'           Info has been found (+/- 100 pages, and 4.5 MB of .pdf files) Now i have to wait untill our team leader has processed it and learns html.         '

In [8]:
data['text'][0]

'info has been found  +   100 pages  and 4 5 mb of  pdf files  now i have to wait untill our team leader has processed it and learns html'

### Merging the labels together

In [9]:
data = data.assign(labels=data.gender.astype(str) + ', ' +
                   data.age.astype(str) + ', ' + data.topic.astype(str) +
                   ', ' + data.sign.astype(str))

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
id        5000 non-null int64
gender    5000 non-null object
age       5000 non-null int64
topic     5000 non-null object
sign      5000 non-null object
date      5000 non-null object
text      5000 non-null object
labels    5000 non-null object
dtypes: int64(2), object(6)
memory usage: 312.6+ KB


In [11]:
data.drop(['id', 'date', 'age', 'gender', 'sign', 'topic'],
          axis=1,
          inplace=True)

# data1 = data.assign(labels = [data.gender, data.age, data.topic, data.sign])
# data1 = data.assign(labels=lambda x: pd.array([x.gender, x.age, x.topic, x.sign]))
# data["labels"] = [data['gender'], data['age'], data['topic'], data['sign']]

In [13]:
data['labels'] = [lbl.split(',') for lbl in data['labels']]

In [14]:
data.head(5)

Unnamed: 0,text,labels
0,info has been found + 100 pages and 4 5 mb...,"[male, 15, student, leo]"
1,these are the team members drewes van der l...,"[male, 15, student, leo]"
2,in het kader van kernfusie op aarde maak je ...,"[male, 15, student, leo]"
3,testing testing,"[male, 15, student, leo]"
4,thanks to yahoo s toolbar i can now capture ...,"[male, 33, investmentbanking, aquarius]"


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
text      5000 non-null object
labels    5000 non-null object
dtypes: object(2)
memory usage: 78.2+ KB


### Separate features and labels, and split the data into training and testing

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data['text'],
                                                    data['labels'],
                                                    test_size=0.3,
                                                    random_state=7)

In [17]:
print(
    f"Shape of the Train data: {X_train.shape}, Target Label shape: {y_train.shape}"
)

Shape of the Train data: (3500,), Target Label shape: (3500,)


In [18]:
print(
    f"Shape of the Test data: {X_test.shape}, Target Label shape: {y_test.shape}"
)

Shape of the Test data: (1500,), Target Label shape: (1500,)


### Vectorize the features

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(min_df=2, ngram_range=(1, 2), stop_words=stopwords)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [20]:
X_train.shape

(3500, 33699)

In [21]:
X_train[0]

<1x33699 sparse matrix of type '<class 'numpy.int64'>'
	with 137 stored elements in Compressed Sparse Row format>

In [22]:
vect.vocabulary_

{'47': 403,
 'hours': 13662,
 'bio': 3230,
 'doesnt': 7645,
 'investigate': 14497,
 'isnt': 14568,
 '48': 404,
 'dont': 7722,
 'need': 19433,
 'instead': 14333,
 'going': 11725,
 'go': 11514,
 'ask': 2077,
 'people': 21478,
 'things': 28891,
 'tell': 28571,
 'arnt': 1940,
 'investigating': 14499,
 'like': 16209,
 'get': 11069,
 'real': 23506,
 'answers': 1595,
 'extra': 9320,
 'hour': 13646,
 'spent': 27067,
 'eating': 8145,
 'cuz': 6493,
 'food': 10349,
 'episode': 8664,
 'mysterious': 19107,
 'ghetto': 11342,
 'lexington': 16080,
 'somewhere': 26800,
 'legendary': 15935,
 'town': 30087,
 'ohio': 20357,
 'lets': 16046,
 'move': 18791,
 'danielle': 6623,
 'house': 13685,
 'believes': 2978,
 'lives': 16766,
 'danille': 6627,
 'yes': 33504,
 'believe': 2947,
 'every': 8972,
 'one': 20507,
 'nobody': 19993,
 'mr': 18856,
 'johnreed': 14799,
 'thinks': 29183,
 'ave': 2308,
 'john': 14763,
 'send': 25624,
 'gave': 10972,
 'ur': 30832,
 'name': 19131,
 'thats': 28752,
 'sounds': 26937,
 'fun

### Count labels

In [23]:
count_labels = {}
for lbl in data['labels']:
    for fea in lbl:
        if fea in count_labels:
            count_labels[fea] += 1
        else:
            count_labels[fea] = 1

In [24]:
count_labels

{'male': 3294,
 ' 15': 339,
 ' student': 569,
 ' leo': 190,
 ' 33': 101,
 ' investmentbanking': 70,
 ' aquarius': 329,
 'female': 1706,
 ' 14': 170,
 ' indunk': 1381,
 ' aries': 2483,
 ' 25': 268,
 ' capricorn': 84,
 ' 17': 331,
 ' gemini': 86,
 ' 23': 137,
 ' non profit': 47,
 ' cancer': 94,
 ' banking': 16,
 ' 37': 19,
 ' sagittarius': 704,
 ' 26': 96,
 ' 24': 353,
 ' scorpio': 408,
 ' 27': 86,
 ' education': 118,
 ' 45': 14,
 ' engineering': 119,
 ' libra': 414,
 ' science': 33,
 ' 34': 540,
 ' 41': 14,
 ' communications media': 61,
 ' businessservices': 87,
 ' sports recreation': 75,
 ' virgo': 41,
 ' taurus': 100,
 ' arts': 31,
 ' pisces': 67,
 ' 44': 3,
 ' 16': 67,
 ' internet': 20,
 ' museums libraries': 2,
 ' accounting': 2,
 ' 39': 79,
 ' 35': 2307,
 ' technology': 2332,
 ' 36': 60,
 ' law': 3,
 ' 46': 7,
 ' consulting': 16,
 ' automotive': 14,
 ' 42': 9,
 ' religion': 4}

### Transforming the labels

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [26]:
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [27]:
y_train[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

### Choosing a classifier

In [30]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(solver='lbfgs') 
clf = OneVsRestClassifier(lr_clf)

### Fitting the classifier to make predictions and printing the final accuracy metrics

In [31]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)



In [32]:
from sklearn import metrics
metrics.classification_report(y_test, y_pred)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


'              precision    recall  f1-score   support\n\n           0       0.60      0.11      0.19        54\n           1       0.80      0.39      0.53       109\n           2       0.80      0.20      0.32        20\n           3       0.78      0.37      0.50        97\n           4       0.25      0.03      0.05        37\n           5       0.78      0.33      0.46       115\n           6       0.57      0.15      0.24        78\n           7       1.00      0.07      0.14        27\n           8       0.83      0.24      0.37        21\n           9       1.00      0.39      0.56        36\n          10       0.96      0.72      0.82       174\n          11       0.82      0.90      0.86       668\n          12       0.40      0.13      0.20        15\n          13       0.00      0.00      0.00         6\n          14       0.50      0.04      0.08        24\n          15       0.00      0.00      0.00         9\n          16       0.00      0.00      0.00         3\n       

In [35]:
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"F1 score-Micro: {metrics.f1_score(y_test, y_pred, average='micro')}")
print(f"F1 score-Macro: {metrics.f1_score(y_test, y_pred, average='macro')}")
print(f"precision: {metrics.precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {metrics.recall_score(y_test, y_pred,average='weighted')}")

Accuracy: 0.522
F1 score-Micro: 0.7443602155447986
F1 score-Macro: 0.31003263425208866
precision: 0.7967365656128985
Recall: 0.6791666666666667


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Printing the true label and predicted label for five values

In [36]:
Y_pred_inv = mlb.inverse_transform(y_pred)
#_test_trans_inv =  mlb.inverse_transform(y_test) # inverse transforming original test label data

In [37]:
Y_pred_inv[0:5]

[(' 35', ' aries', ' technology', 'male'),
 (' 24', ' businessservices', ' cancer', 'male'),
 (' 35', ' aries', ' technology', 'male'),
 (' 34', ' indunk', ' sagittarius', 'female'),
 (' 35', ' aries', ' technology', 'male')]

In [38]:
y_test=mlb.inverse_transform(y_test)

In [39]:
y_test[0:5]

[(' 35', ' aries', ' technology', 'male'),
 (' 24', ' businessservices', ' cancer', 'male'),
 (' 35', ' aries', ' technology', 'male'),
 (' 34', ' indunk', ' sagittarius', 'female'),
 (' 35', ' aries', ' technology', 'male')]

## Summary of the project
1. Understanding the dataset and preprocessing the text data
2. Preparing the data for multi label classification
3. Creating a count dict for summarization of the classes
4. Using MultiLabelBinarizer for transformin the multiple labels to be predicted and inverse the predictions to the actual values
5. Using one vs rest classifier with Logistic regression classifier as base
