## Submitted by: Aaryan Verma
## For Bipolar Factory internship Assessment.
### Date of Submission: 23rd April 2020

In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article  
import csv 
import pandas as pd
import numpy as np

# Crawling News from Times of India Website

In [2]:
url = "https://timesofindia.indiatimes.com/world"
r = requests.get(url)

In [3]:
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.findAll('a', attrs = {'class':'w_img'}) 

In [4]:
news=[]
for row in table: 
    if not row['href'].startswith('http'):
        news.append('https://timesofindia.indiatimes.com'+row['href'])

In [5]:
df=[]
for i in news:
    article = Article(i, language="en")
    article.download() 
    article.parse() 
    article.nlp() 
    data={}
    data['Title']=article.title
    data['Text']=article.text
    data['Summary']=article.summary
    data['Keywords']=article.keywords
    df.append(data)

In [6]:
dataset=pd.DataFrame(df)
dataset.head()

Unnamed: 0,Keywords,Summary,Text,Title
0,"[times, envoy, era, bully, chinese, coronaviru...",Order NowLONDON: China did not cover up the no...,People wear face masks to protect against the ...,"There was no China cover-up of coronavirus, Ch..."
1,"[workplaces, turning, transport, average, lock...","Apr 22, 2020, 04:03PM ISTSource: APIndia's ext...","Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...",Covid-19: Lockdown reveals fresh and clean air...
2,"[launch, tensions, surprise, launched, wider, ...","Apr 22, 2020, 03:58PM ISTSource: APIran's Revo...","Apr 22, 2020, 03:58PM IST\n\nSource: AP\n\nIra...",Iran's Guard says it launched satellite
3,"[animals, wild, shows, pollution, air, wales, ...",As people across the globe stay home to stop t...,"Apr 22, 2020, 04:04PM IST\n\nSource: AP\n\nAn ...",Covid-19: Virus shutdown shows human impact on...
4,"[san, sure, stolen, talermo, white, telling, s...","Apr 22, 2020, 03:57PM ISTSource: APFour months...","Apr 22, 2020, 03:57PM IST\n\nSource: AP\n\nFou...",San Francisco: Dog stolen in December has joyo...


# Model for predicting virality of news

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [8]:
FILEPATH="E:\Machine_Learning_Models\OnlineNewsPopularity\OnlineNewsPopularity.csv"

In [9]:
def clean_cols(data):
    """Clean the column names by stripping and lowercase."""
    clean_col_map = {x: x.lower().strip() for x in list(data)}
    return data.rename(index=str, columns=clean_col_map)

def TrainTestSplit(X, Y, R=0, test_size=0.2):
    """Easy Train Test Split call."""
    return train_test_split(X, Y, test_size=test_size, random_state=R)

#### I removed these features below because I didn't know how to calculate these features for the crawled news text.

In [10]:
full_data = clean_cols(pd.read_csv(FILEPATH))
train_set, test_set = train_test_split(full_data, test_size=0.20, random_state=42)

x_train = train_set.drop(['url','shares', 'timedelta', 'lda_00','lda_01','lda_02','lda_03','lda_04','num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_train = train_set['shares']

x_test = test_set.drop(['url','shares', 'timedelta', 'num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_test = test_set['shares']

In [11]:
clf = RandomForestRegressor(random_state=42)
clf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [12]:
rf_res = pd.DataFrame(clf.predict(x_train),list(y_train))

In [13]:
rf_res.reset_index(level=0, inplace=True)
rf_res_df = rf_res.rename(index=str, columns={"index": "Actual shares", 0: "Predicted shares"})
rf_res_df.head()

Unnamed: 0,Actual shares,Predicted shares
0,16100,11247.11
1,508,882.22
2,1300,1864.97
3,3100,3627.22
4,6900,5415.02


# Converting Crawled News according to Training Set in UCI Dataset

In [14]:
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

In [15]:
def rate_unique(words):
    words=tokenize(words)
    no_order = list(set(words))
    rate_unique=len(no_order)/len(words)
    return rate_unique

In [16]:
def rate_nonstop(words):
    words=tokenize(words)
    filtered_sentence = [w for w in words if not w in stopwords]
    rate_nonstop=len(filtered_sentence)/len(words)
    no_order = list(set(filtered_sentence))
    rate_unique_nonstop=len(no_order)/len(words)
    return rate_nonstop,rate_unique_nonstop

In [17]:
def avg_token(words):
    words=tokenize(words)
    length=[]
    for i in words:
        length.append(len(i))
    return np.average(length)

In [18]:
from textblob import TextBlob

In [19]:
import datefinder
import datetime  
from datetime import date 
def day(article_text):
    article=article_text
    if len(list(datefinder.find_dates(article)))>0:
        date=str(list(datefinder.find_dates(article))[0])
        date=date.split()
        date=date[0]
        year, month, day = date.split('-')     
        day_name = datetime.date(int(year), int(month), int(day)) 
        return day_name.strftime("%A")
    return "Monday"

In [20]:
def tokenize(text):
    text=text
    return word_tokenize(text)

In [21]:
pos_words=[]
neg_words=[]
def polar(words):
    all_tokens=tokenize(words)
    for i in all_tokens:
        analysis=TextBlob(i)
        polarity=analysis.sentiment.polarity
        if polarity>0:
            pos_words.append(i)
        if polarity<0:
            neg_words.append(i)
    return pos_words,neg_words

In [22]:
def rates(words):
    words=polar(words)
    pos=words[0]
    neg=words[1]
    all_words=words
    global_rate_positive_words=(len(pos)/len(all_words))/100
    global_rate_negative_words=(len(neg)/len(all_words))/100
    pol_pos=[]
    pol_neg=[]
    for i in pos:
        analysis=TextBlob(i)
        pol_pos.append(analysis.sentiment.polarity)
        avg_positive_polarity=analysis.sentiment.polarity
    for j in neg:
        analysis2=TextBlob(j)
        pol_neg.append(analysis2.sentiment.polarity)
        avg_negative_polarity=analysis2.sentiment.polarity
    min_positive_polarity=min(pol_pos)
    max_positive_polarity=max(pol_pos)
    min_negative_polarity=min(pol_neg)
    max_negative_polarity=max(pol_neg)
    avg_positive_polarity=np.average(pol_pos)
    avg_negative_polarity=np.average(pol_neg)
    return global_rate_positive_words,global_rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity


In [23]:
df2=[]
for i in news:
    pred_info={}
    article = Article(i, language="en") # en for English 
    article.download() 
    article.parse()
    analysis=TextBlob(article.text)
    polarity=analysis.sentiment.polarity
    title_analysis=TextBlob(article.title)
    pred_info['text']=article.text
    pred_info['n_tokens_title']=len(tokenize(article.title))
    pred_info['n_tokens_content']=len(tokenize(article.text))
    pred_info['n_unique_tokens']=rate_unique(article.text)
    pred_info['n_non_stop_words']=rate_nonstop(article.text)[0]
    pred_info['n_non_stop_unique_tokens']=rate_nonstop(article.text)[1]
    pred_info['num_hrefs']=article.html.count("https://timesofindia.indiatimes.com")
    pred_info['num_imgs']=len(article.images)
    pred_info['num_videos']=len(article.movies)
    pred_info['average_token_length']=avg_token(article.text)
    pred_info['num_keywords']=len(article.keywords)
    
    if "life-style" in article.url:
        pred_info['data_channel_is_lifestyle']=1
    else:
        pred_info['data_channel_is_lifestyle']=0
    if "etimes" in article.url:
        pred_info['data_channel_is_entertainment']=1
    else:
        pred_info['data_channel_is_entertainment']=0
    if "business" in article.url:
        pred_info['data_channel_is_bus']=1
    else:
        pred_info['data_channel_is_bus']=0
    if "social media" or "facebook" or "whatsapp" in article.text.lower():
        data_channel_is_socmed=1
        data_channel_is_tech=0
        data_channel_is_world=0
    else:
        data_channel_is_socmed=0
    if ("technology" or "tech" in article.text.lower()) or ("technology" or "tech" in article.url):
        data_channel_is_tech=1
        data_channel_is_socmed=0
        data_channel_is_world=0
    else:
        data_channel_is_tech=0
    if "world" in article.url:
        data_channel_is_world=1
        data_channel_is_tech=0
        data_channel_is_socmed=0
    else:
        data_channel_is_world=0
        
    pred_info['data_channel_is_socmed']=data_channel_is_socmed
    pred_info['data_channel_is_tech']=data_channel_is_tech
    pred_info['data_channel_is_world']=data_channel_is_world
    
    if day(i)=="Monday":
        pred_info['weekday_is_monday']=1
    else:
        pred_info['weekday_is_monday']=0
    if day(i)=="Tuesday":
        pred_info['weekday_is_tuesday']=1
    else:
        pred_info['weekday_is_tuesday']=0
    if day(i)=="Wednesday":
        pred_info['weekday_is_wednesday']=1
    else:
        pred_info['weekday_is_wednesday']=0
    if day(i)=="Thursday":
        pred_info['weekday_is_thursday']=1
    else:
        pred_info['weekday_is_thursday']=0
    if day(i)=="Friday":
        pred_info['weekday_is_friday']=1
    else:
        pred_info['weekday_is_friday']=0
    if day(i)=="Saturday":
        pred_info['weekday_is_saturday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_saturday']=0
    if day(i)=="Sunday":
        pred_info['weekday_is_sunday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_sunday']=0
        pred_info['is_weekend']=0
        
    pred_info['global_subjectivity']=analysis.sentiment.subjectivity
    pred_info['global_sentiment_polarity']=analysis.sentiment.polarity
    pred_info['global_rate_positive_words']=rates(article.text)[0]
    pred_info['global_rate_negative_words']=rates(article.text)[1]
    pred_info['avg_positive_polarity']=rates(article.text)[2]
    pred_info['min_positive_polarity']=rates(article.text)[3]
    pred_info['max_positive_polarity']=rates(article.text)[4]
    pred_info['avg_negative_polarity']=rates(article.text)[5]
    pred_info['min_negative_polarity']=rates(article.text)[6]
    pred_info['max_negative_polarity']=rates(article.text)[7]    
    pred_info['title_subjectivity']=title_analysis.sentiment.subjectivity
    pred_info['title_sentiment_polarity']=title_analysis.sentiment.polarity
    df2.append(pred_info)

In [24]:
pred_df=pd.DataFrame(df2)
pred_test=pred_df.drop(['text'],axis=1)
pred_df.head()

Unnamed: 0,average_token_length,avg_negative_polarity,avg_positive_polarity,data_channel_is_bus,data_channel_is_entertainment,data_channel_is_lifestyle,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,global_rate_negative_words,...,text,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,4.123506,-0.375,0.282008,0,0,0,0,0,1,0.03,...,People wear face masks to protect against the ...,0.0,0.0,0,1,0,0,0,0,0
1,4.443548,-0.314286,0.285569,0,0,0,0,1,0,0.15,...,"Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...",0.333333,0.6,0,0,0,1,0,0,0
2,4.706522,-0.293333,0.290794,0,0,0,0,1,0,0.26,...,"Apr 22, 2020, 03:58PM IST\n\nSource: AP\n\nIra...",0.0,0.0,0,1,0,0,0,0,0
3,4.107843,-0.258156,0.298278,0,0,0,0,1,0,0.37,...,"Apr 22, 2020, 04:04PM IST\n\nSource: AP\n\nAn ...",0.0,0.1,0,0,0,1,0,0,0
4,4.149351,-0.265134,0.317576,0,0,0,0,1,0,0.54,...,"Apr 22, 2020, 03:57PM IST\n\nSource: AP\n\nFou...",0.0,0.0,0,0,0,0,0,0,1


# Final Results depicting the Likelihood of Virality of News

In [25]:
test2=pd.DataFrame(clf.predict(pred_test),pred_df['text'])
test2.reset_index(level=0, inplace=True)
test2 = test2.rename(index=str, columns={"index": "News", 0: "Virality"})
test2

Unnamed: 0,text,Virality
0,People wear face masks to protect against the ...,15865.82
1,"Apr 22, 2020, 04:03PM IST\n\nSource: AP\n\nInd...",38793.47
2,"Apr 22, 2020, 03:58PM IST\n\nSource: AP\n\nIra...",29560.5
3,"Apr 22, 2020, 04:04PM IST\n\nSource: AP\n\nAn ...",24550.5
4,"Apr 22, 2020, 03:57PM IST\n\nSource: AP\n\nFou...",32007.47
5,"Apr 21, 2020, 09:25AM IST\n\nSource: Times Now...",39627.29
6,"Apr 21, 2020, 03:38PM IST\n\nSource: AP\n\nA s...",22519.37
7,"Apr 21, 2020, 03:42PM IST\n\nSource: AP\n\nPre...",32447.44
8,"Apr 21, 2020, 03:39PM IST\n\nSource: AP\n\nThe...",22282.4
9,"Apr 21, 2020, 03:43PM IST\n\nSource: AP\n\nSta...",29820.28


### Note: I don't Know whether these results are fully correct or not because I had to assume some of the parameters for the sake of generating results. I hope this work satisfies the problem statement.