# Modeling

In [30]:
import pandas as pd
import acquire
import prepare
from sklearn.feature_extraction.text import TfidfVectorizer
import re


Acquire data

In [119]:
raw_news = acquire.acquire_news_articles()
raw_news.head()

Unnamed: 0,title,content,category
0,RBI keeps repo rate unchanged at 5.15% after f...,The RBI on Thursday kept the repo rate unchang...,business
1,P Chidambaram walks out of Tihar Jail after 10...,Congress leader P Chidambaram walked out of Ti...,business
2,"8, 7, 6.6, 5.8, 5 & 4.5 is the state of econom...",Former Finance Minister P Chidambaram on Thurs...,business
3,Google Co-founders gain $2.3B after Sundar Pic...,Google Co-founders Sergey Brin and Larry Page ...,business
4,Musk says in court his net worth is around $20...,"Tesla and SpaceX founder and CEO Elon Musk, in...",business


In [120]:
news = prepare.basic_clean(raw_news, stem_or_lem = 'stem')
news['content'] = news.content.apply(lambda x: re.sub(r'(^|\s).{,2}(\s|$)', ' ', x))
news

Unnamed: 0,title,content,category
0,rbi keep repo rate unchang five cut thi year,rbi thursday kept repo rate unchang five cut t...,busi
1,p chidambaram walk tihar jail day,congress leader chidambaram walk tihar jail we...,busi
2,state economi chidambaram gdp growth,former financ minist chidambaram thursday said...,busi
3,googl cofound gain b sundar pichai made alphab...,googl cofound sergey brin larri page combin bi...,busi
4,musk say court hi net worth around bn much cash,tesla spacex founder ceo elon musk ongo defam ...,busi
...,...,...,...
95,priyanka chopra receiv unicef ' danni kay huma...,actress priyanka chopra unicef goodwil ambassa...,entertain
96,' imagin live life fear safeti shibani,singer actress shibani dandekar recal incid on...,entertain
97,check filmi ethic befor lectur shoojit sircar ...,director shoojit sircar wednesday took twitter...,entertain
98,' feel butterfli stomach deepika choos script,deepika padukon said continu choos script way ...,entertain


In [121]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(news['content'])
pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names()).head()
#pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names()).join(news.category, rsuffix='_y')

Unnamed: 0,aaj,aaryan,abdul,abl,abus,acceler,accept,access,accommod,account,...,xiaomi,year,yearold,york,young,younger,youto,yuvraj,zlatan,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.093385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.127029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.133433,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

X = pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names())
y = news.category

X_train, X_test, y_train, y_test = train_test_split(X,y,stratify = y)

train = pd.DataFrame(dict(actual = y_train))
test = pd.DataFrame(dict(actual = y_test))

lreg = DecisionTreeClassifier(max_depth=4).fit(X_train,y_train)

train['predicted'] = lreg.predict(X_train)
test['predicted']= lreg.predict(X_test)

In [123]:
print(accuracy_score(train.actual, train.predicted))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(classification_report(train.actual, train.predicted))

0.6266666666666667
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              precision    recall  f1-score   support

        busi       1.00      0.63      0.77        19
   entertain       1.00      0.47      0.64        19
       sport       0.40      1.00      0.58        19
   technolog       1.00      0.39      0.56        18

    accuracy                           0.63        75
   macro avg       0.85      0.62      0.64        75
weighted avg       0.85      0.63      0.64        75



In [124]:
print(accuracy_score(test.actual, test.predicted))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(classification_report(test.actual, test.predicted))

0.4
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              precision    recall  f1-score   support

        busi       0.00      0.00      0.00         6
   entertain       1.00      0.50      0.67         6
       sport       0.35      1.00      0.52         6
   technolog       0.33      0.14      0.20         7

    accuracy                           0.40        25
   macro avg       0.42      0.41      0.35        25
weighted avg       0.42      0.40      0.34        25



In [125]:
pd.crosstab(train.actual, train.predicted)

predicted,busi,entertain,sport,technolog
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
busi,12,0,7,0
entertain,0,9,10,0
sport,0,0,19,0
technolog,0,0,11,7


In [126]:
pd.DataFrame(train.actual.value_counts()).join(train.predicted.value_counts())

Unnamed: 0,actual,predicted
busi,19,12
entertain,19,9
sport,19,47
technolog,18,7
