Author:     Maple FENG  
Data:       2021-10-19  
Document:   tfidf.ipynb

In [9]:
import string
import pandas as pd
import numpy as np
import re

from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [10]:
train_all = pd.read_csv('train1020.csv')
test_all = pd.read_csv('test1020.csv')
print(train_all.shape)
print(test_all.shape)

(2080, 13)
(890, 13)


In [11]:
train_filtered = train_all[~train_all['description'].isna()]
test_filtered = test_all[~test_all['description'].isna()]
print(train_filtered.shape)
print(test_filtered.shape)

(2029, 13)
(877, 13)


In [12]:
train_en = train_filtered[(train_filtered['description'].apply(lambda x: detect(x)) == 'en') & 
                          (train_filtered['title'].apply(lambda x: detect(x)) == 'en')]
test_en = test_filtered[(test_filtered['description'].apply(lambda x: detect(x)) == 'en') & 
                        (test_filtered['title'].apply(lambda x: detect(x)) == 'en')]
print(train_en.shape)
print(test_en.shape)

(1831, 13)
(803, 13)


In [13]:
train = train_en['title'] + ' ' + train_en['description']
test = test_en['title'] + ' ' + test_en['description']
Y_train = train_en['category']
Y_test = test_en['category']

In [51]:
print('Training Set\n')
print(Y_train.value_counts())

Training Set

Business                            434
Computer Science                    259
Health                              241
Data Science                        201
Social Sciences                     173
Physical Science and Engineering    171
Arts and Humanities                 151
Information Technology               81
Language Learning                    53
Personal Development                 45
Math and Logic                       22
Name: category, dtype: int64


In [52]:
print('Test Set\n')
print(Y_test.value_counts())

Test Set

Business                            184
Health                              115
Data Science                         98
Computer Science                     95
Physical Science and Engineering     92
Social Sciences                      64
Arts and Humanities                  54
Language Learning                    36
Information Technology               34
Personal Development                 23
Math and Logic                        8
Name: category, dtype: int64


In [16]:
mystopwords = stopwords.words('english') + ['\'s', 'university', 'universities', 'student', 'students', 'mooc',
                                            'one', 'two', 'three', 'introduction', 'use', 'don\'t', 'doesn\'t', '\'m',
                                            '--', '...', '10,000', '16,000,000,000', '1990s', '2030.', '20th', '21st', 
                                            '2d', '360‚ñ¢', '3d', '4-week', '500,000', '8.', '``', '\'\'', '‚äì', '‚äî','‚äù',
                                            '\'ll', 'also', 'would', 'we\'ll', 'via', 'upon', ]
garble = re.compile(r'‚äô')
punctuations = string.punctuation + "’“”" 
steammer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess(text):
    tokens = word_tokenize(text.lower().strip())
    tokens = [garble.sub('\'', token) for token in tokens]
    tokens = [token for token in tokens if token not in mystopwords]
    tokens = [token for token in tokens if token not in punctuations]
    tokens = [token for token in tokens if not token.isnumeric()]
    tokens = [token for token in tokens if len(token) > 1]
    # tokens = [lemmatizer.lemmatize(token).strip() for token in tokens]
    tokens = [steammer.stem(token).strip() for token in tokens]
    return tokens

vectorizer = TfidfVectorizer(tokenizer=preprocess, decode_error='ignore', max_df=0.7, min_df=15)

In [18]:
X_train = vectorizer.fit_transform(train)
X_train

<1831x847 sparse matrix of type '<class 'numpy.float64'>'
	with 44588 stored elements in Compressed Sparse Row format>

In [19]:
X_train.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.15075365,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [20]:
vectorizer.get_feature_names_out()[:1000]

array(['abil', 'abl', 'academ', 'acceler', 'access', 'accord', 'account',
       'achiev', 'acquir', 'across', 'act', 'action', 'activ', 'actual',
       'ad', 'adapt', 'addit', 'address', 'administr', 'adopt', 'advanc',
       'affect', 'age', 'ai', 'aim', 'algorithm', 'allow', 'almost',
       'along', 'alreadi', 'altern', 'alway', 'american', 'among',
       'analys', 'analysi', 'analyt', 'analyz', 'anim', 'anoth', 'answer',
       'anyon', 'api', 'app', 'appli', 'applic', 'appreci', 'approach',
       'appropri', 'architectur', 'area', 'around', 'art', 'artifici',
       'artist', 'ask', 'aspect', 'assess', 'asset', 'assign', 'associ',
       'assum', 'attent', 'audienc', 'author', 'autom', 'avail', 'avoid',
       'aw', 'awar', 'background', 'balanc', 'base', 'basi', 'basic',
       'becom', 'begin', 'behavior', 'behind', 'benefit', 'best',
       'better', 'beyond', 'big', 'bioinformat', 'biolog', 'blockchain',
       'board', 'bodi', 'book', 'boulder', 'brand', 'bring', 'broad',

In [21]:
X_test = vectorizer.transform(test)

In [22]:
X_test

<803x847 sparse matrix of type '<class 'numpy.float64'>'
	with 19593 stored elements in Compressed Sparse Row format>

In [27]:
model = MultinomialNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_pred, Y_test)}\n')
print(classification_report(Y_pred, Y_test))

Accuracy: 0.6537982565379825

                                  precision    recall  f1-score   support

             Arts and Humanities       0.56      0.75      0.64        40
                        Business       0.96      0.60      0.74       293
                Computer Science       0.80      0.50      0.62       151
                    Data Science       0.71      0.72      0.72        97
                          Health       0.71      0.73      0.72       113
          Information Technology       0.03      0.50      0.06         2
               Language Learning       0.36      1.00      0.53        13
                  Math and Logic       0.00      0.00      0.00         0
            Personal Development       0.00      0.00      0.00         0
Physical Science and Engineering       0.49      0.87      0.62        52
                 Social Sciences       0.48      0.74      0.58        42

                        accuracy                           0.65       803
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
model = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=80, tol=None)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_pred, Y_test)}\n')
print(classification_report(Y_pred, Y_test))

Accuracy: 0.7023661270236613

                                  precision    recall  f1-score   support

             Arts and Humanities       0.72      0.64      0.68        61
                        Business       0.90      0.70      0.79       236
                Computer Science       0.78      0.69      0.73       108
                    Data Science       0.77      0.69      0.72       109
                          Health       0.73      0.71      0.72       119
          Information Technology       0.38      0.65      0.48        20
               Language Learning       0.72      0.96      0.83        27
                  Math and Logic       0.25      0.50      0.33         4
            Personal Development       0.00      0.00      0.00         1
Physical Science and Engineering       0.59      0.79      0.68        68
                 Social Sciences       0.48      0.62      0.54        50

                        accuracy                           0.70       803
      

In [29]:
model = LogisticRegression(C=1e5)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_pred, Y_test)}\n')
print(classification_report(Y_pred, Y_test))

Accuracy: 0.6650062266500623

                                  precision    recall  f1-score   support

             Arts and Humanities       0.67      0.61      0.64        59
                        Business       0.80      0.71      0.75       207
                Computer Science       0.74      0.65      0.69       107
                    Data Science       0.67      0.69      0.68        95
                          Health       0.70      0.68      0.69       117
          Information Technology       0.56      0.58      0.57        33
               Language Learning       0.72      1.00      0.84        26
                  Math and Logic       0.25      0.67      0.36         3
            Personal Development       0.04      0.11      0.06         9
Physical Science and Engineering       0.53      0.60      0.57        81
                 Social Sciences       0.59      0.58      0.58        66

                        accuracy                           0.67       803
      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
model = RandomForestClassifier(n_estimators=300, max_depth=70, random_state=1)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_pred, Y_test)}\n')
print(classification_report(Y_pred, Y_test))

Accuracy: 0.684931506849315

                                  precision    recall  f1-score   support

             Arts and Humanities       0.70      0.72      0.71        53
                        Business       0.91      0.65      0.76       255
                Computer Science       0.73      0.61      0.66       114
                    Data Science       0.78      0.69      0.73       110
                          Health       0.75      0.70      0.73       122
          Information Technology       0.38      0.72      0.50        18
               Language Learning       0.69      0.93      0.79        27
                  Math and Logic       0.25      0.67      0.36         3
            Personal Development       0.00      0.00      0.00         0
Physical Science and Engineering       0.48      0.77      0.59        57
                 Social Sciences       0.47      0.68      0.56        44

                        accuracy                           0.68       803
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
