In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [None]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')
df = pd.concat([df1,df2],ignore_index=True)

In [None]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [None]:
df.isna().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [None]:
df.duplicated().sum()

np.int64(0)

Splitting

In [None]:
x = df['Description']
y = df['Class Index']

Train and Test

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=4)

Text Preprocessing

In [None]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def text_preprocessing(text):
    v1 = "".join([i for i in text.lower() if i not in punctuation])
    v2 = [lemmatizer.lemmatize(i,'v') for i in v1.split() if i not in stop]
    return  " ".join([lemmatizer.lemmatize(i,'r') for i in v2 if i not in stop])

Sampling

In [None]:
y_train.value_counts()

Class Index
2    22378
1    22331
4    22308
3    22303
Name: count, dtype: int64

Vectorization and  Modeling

In [None]:
models = [MultinomialNB(),LinearSVC(),DecisionTreeClassifier()]

for modell in models:
    print(modell)

    model = Pipeline([('vectorizer',TfidfVectorizer(preprocessor=text_preprocessing)),
                    ('model',modell)])

    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)

    print(classification_report(y_test,y_pred))
    # print(cross_val_score(model,x_train,y_train,cv=5).mean())
    print("_____"*20)

MultinomialNB()
              precision    recall  f1-score   support

           1       0.90      0.89      0.90      9569
           2       0.94      0.98      0.96      9522
           3       0.86      0.87      0.86      9597
           4       0.88      0.85      0.87      9592

    accuracy                           0.90     38280
   macro avg       0.90      0.90      0.90     38280
weighted avg       0.90      0.90      0.90     38280

____________________________________________________________________________________________________
LinearSVC()
              precision    recall  f1-score   support

           1       0.92      0.89      0.91      9569
           2       0.95      0.98      0.96      9522
           3       0.88      0.88      0.88      9597
           4       0.89      0.89      0.89      9592

    accuracy                           0.91     38280
   macro avg       0.91      0.91      0.91     38280
weighted avg       0.91      0.91      0.91     38280

_

In [None]:
print(cross_val_score(LinearSVC(),x_train,y_train,cv=5).mean())



Final Model

In [None]:
model = Pipeline([('vectorizer',TfidfVectorizer(preprocessor=text_preprocessing)),
                  ('model',LinearSVC())])

In [None]:
model.fit(x_train,y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.92      0.89      0.91      9569
           2       0.95      0.98      0.96      9522
           3       0.88      0.88      0.88      9597
           4       0.89      0.89      0.89      9592

    accuracy                           0.91     38280
   macro avg       0.91      0.91      0.91     38280
weighted avg       0.91      0.91      0.91     38280



Hyperparameter Tuning

In [None]:

params = {'model__C': [0.01, 0.1, 1, 10],'model__loss': ['hinge', 'squared_hinge']}

grid = RandomizedSearchCV(model,params,cv=5,n_iter=10)

grid.fit(x_train,y_train)


In [None]:
grid.best_score_

np.float64(0.9074003582624274)

In [None]:
# final model

final_model = grid.best_estimator_

In [None]:
import pickle

data = {'model':final_model}

with open('news.pkl','wb') as ob1:
    pickle.dump(data,ob1)