# Modeling

In [30]:
import pandas as pd
import acquire
import prepare
from sklearn.feature_extraction.text import TfidfVectorizer
import re


Acquire data

In [44]:
raw_news = acquire.acquire_news_articles()
raw_news.head()

Unnamed: 0,title,content,category
0,RBI keeps repo rate unchanged at 5.15% after f...,The RBI on Thursday kept the repo rate unchang...,business
1,P Chidambaram walks out of Tihar Jail after 10...,Congress leader P Chidambaram walked out of Ti...,business
2,"8, 7, 6.6, 5.8, 5 & 4.5 is the state of econom...",Former Finance Minister P Chidambaram on Thurs...,business
3,Google Co-founders gain $2.3B after Sundar Pic...,Google Co-founders Sergey Brin and Larry Page ...,business
4,Musk says in court his net worth is around $20...,"Tesla and SpaceX founder and CEO Elon Musk, in...",business


In [47]:
news = prepare.basic_clean(raw_news)
news['title'] = news.title.apply(lambda x: re.sub(r'(^|\s).{,2}(\s|$)', ' ', x))
news

Unnamed: 0,title,content,category
0,rbi keep repo rate unchanged five cut year,rbi thursday kept repo rate unchanged five cut...,business
1,chidambaram walk tihar jail day,congress leader p chidambaram walked tihar jai...,business
2,state economy chidambaram gdp growth,former finance minister p chidambaram thursday...,business
3,google cofounder gain sundar pichai made alpha...,google cofounder sergey brin larry page added ...,business
4,musk say court net worth around much cash,tesla spacex founder ceo elon musk ongoing def...,business
...,...,...,...
95,priyanka chopra receives unicef danny kaye hum...,actress priyanka chopra unicef ' goodwill amba...,entertainment
96,imagine living life fear safety shibani,singer actress shibani dandekar recalled incid...,entertainment
97,check filmy ethic lecturing shoojit sircar bol...,director shoojit sircar wednesday took twitter...,entertainment
98,feel butterfly stomach deepika choosing script,deepika padukone ha said continues choose scri...,entertainment


In [75]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(news['title'])
pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names()).head()
#pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names()).join(news.category, rsuffix='_y')

Unnamed: 0,abdul,able,accessed,accused,act,acting,actor,actress,adult,african,...,worst,worth,would,wsj,xiaomi,year,young,youngster,yrold,zlatan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.371582,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.367056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

X = pd.DataFrame(tfidfs.todense(), columns = tfidf.get_feature_names())
y = news.category

X_train, X_test, y_train, y_test = train_test_split(X,y,stratify = y)

train = pd.DataFrame(dict(actual = y_train))
test = pd.DataFrame(dict(actual = y_test))

lreg = DecisionTreeClassifier(max_depth=4).fit(X_train,y_train)

train['predicted'] = lreg.predict(X_train)
test['predicted']= lreg.predict(X_test)

In [90]:
print(classification_report(train.actual, train.predicted))

               precision    recall  f1-score   support

     business       0.80      0.22      0.35        18
entertainment       0.31      1.00      0.47        19
        sport       1.00      0.16      0.27        19
   technology       1.00      0.32      0.48        19

     accuracy                           0.43        75
    macro avg       0.78      0.42      0.39        75
 weighted avg       0.78      0.43      0.39        75



In [92]:
print(accuracy_score(test.actual, test.predicted))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(classification_report(test.actual, test.predicted))

0.28
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               precision    recall  f1-score   support

     business       1.00      0.14      0.25         7
entertainment       0.25      1.00      0.40         6
        sport       0.00      0.00      0.00         6
   technology       0.00      0.00      0.00         6

     accuracy                           0.28        25
    macro avg       0.31      0.29      0.16        25
 weighted avg       0.34      0.28      0.17        25



  'precision', 'predicted', average, warn_for)


In [95]:
pd.crosstab(train.actual, train.predicted)

predicted,business,entertainment,sport,technology
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
business,4,14,0,0
entertainment,0,19,0,0
sport,0,16,3,0
technology,1,12,0,6


In [99]:
pd.DataFrame(train.actual.value_counts()).join(train.predicted.value_counts())

Unnamed: 0,actual,predicted
technology,19,6
sport,19,3
entertainment,19,61
business,18,5
