# Análisis de noticias

## Preparación de ambiente

### Carga de módulos

In [None]:
# Data Wrangling
import pandas as pd

# Data Visualization
import cufflinks as cf

# Modeling
from sklearn.linear_model import LinearRegression

# Model performance
from sklearn.metrics import r2_score

# Put cufflinks/plotly in offline mode so the .iplot() calls below render locally in the notebook.
cf.go_offline()

## Data Wrangling

### Carga de Datos

In [None]:
# Load the raw dataset; the relative path expects the CSV to sit in the working directory.
df = pd.read_csv("./OnlineNewsPopularity.csv")

In [None]:
# Peek at the URL of the row labelled 0 (label-based scalar lookup).
df.at[0, "url"]

'http://mashable.com/2013/01/07/amazon-instant-video-browser/'

In [None]:
# Remove leading/trailing whitespace from every column label.
df.columns = df.columns.str.strip()

In [None]:
# Dtypes and non-null counts; "deep" also measures the real memory held by object columns.
df.info(memory_usage="deep")

### Clasificación de variables

In [None]:
# Response variable first, so the predictor list can refer to it by name.
target = "shares"
# Candidate predictors: every column except the target, the article URL and "timedelta".
ls_cont = [col for col in df.columns if col not in {target, "url", "timedelta"}]

### Limpieza

In [None]:
# Fraction of missing values per candidate predictor.
(
    df[ls_cont]
    .isnull()
    .mean()
    .to_frame()
)

In [None]:
# Distinct-value count per predictor, smallest first.
df.loc[:, ls_cont].nunique().sort_values(ascending=True).to_frame()

In [None]:
%%timeit
# Benchmark: built-in per-column distinct count.
df[ls_cont].nunique()

53.5 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# Benchmark: same count through apply + unique (unlike nunique, unique() also counts NaN as a value).
df[ls_cont].apply(lambda x: len(x.unique()))

51.3 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Interactive histogram of the target to inspect its distribution.
df[target].iplot(kind="hist", theme="solar")

In [None]:
# Summary statistics of the target, adding the 95th and 99th percentiles to look at the upper tail.
df[target].describe(percentiles=[0.95, 0.99])

#### Outliers

In [None]:
# Upper bound per variable (predictors and target): its 99th percentile.
dc_ol = {
    col: df[col].quantile(q=0.99)
    for col in [*ls_cont, target]
}

In [None]:
# Show the 99th-percentile bound computed for each variable.
dc_ol

In [None]:
# Scratch copy, so the outlier experiment below leaves `df` untouched.
aux = df.copy(deep=True)

In [None]:
# Keep only rows at or below the bound of *every* variable in dc_ol;
# one combined mask gives the same rows as filtering variable by variable.
aux = aux[aux[list(dc_ol)].le(pd.Series(dc_ol), axis="columns").all(axis="columns")]

In [None]:
# Rows and columns left after bounding every variable.
aux.shape

(31221, 61)

In [None]:
# Share of the original rows that survive when every variable is bounded.
len(aux) / len(df)

0.7875340530723438

In [None]:
# Only the target is capped in the working frame. NOTE(review): dc_ol was computed on the
# unfiltered df, so recomputing dc_ol after this cell would change the bound.
df = df.loc[df[target].le(dc_ol[target])].reset_index(drop=True)

In [None]:
# Design matrix (all candidate predictors) and response vector.
X, y = df[ls_cont], df[target]

In [None]:
# List the predictor names available for modelling.
X.columns

## Modelado

In [None]:
# Interactive histogram of the title sentiment polarity.
df['title_sentiment_polarity'].iplot(kind="hist", theme="solar")

In [None]:
# Preview the positive/negative word-rate columns; .head() avoids rendering the whole frame.
df[["global_rate_positive_words", "global_rate_negative_words"]].head()

In [None]:
# Number of candidate predictors, as a 1-tuple.
X.columns.shape

(58,)

### Selección de variables

In [None]:
# Hand-picked subset of predictors for the first linear model.
ls_features = [
    "n_tokens_title",
    "n_tokens_content",
    "num_hrefs",
    "num_imgs",
    "num_videos",
    "is_weekend",
    "title_sentiment_polarity",
    "global_sentiment_polarity",
    "num_keywords",
    "global_rate_positive_words",
]

#### Entrenamiento del modelo

In [None]:
# Linear regression estimator with default settings.
linreg = LinearRegression()

In [None]:
# Fit on the hand-picked features, using every available row (no hold-out split).
linreg.fit(X[ls_features], y)

LinearRegression()

### Interpretación

In [None]:
# Pair each feature with its fitted coefficient, sorted from most negative to most positive.
pd.DataFrame({"variable": ls_features, "coeficient": linreg.coef_}).sort_values("coeficient")

Unnamed: 0,variable,coeficient
7,global_sentiment_polarity,-186.700074
0,n_tokens_title,-13.862037
1,n_tokens_content,-0.447227
2,num_hrefs,20.026217
3,num_imgs,31.168166
4,num_videos,37.635155
8,num_keywords,52.195306
6,title_sentiment_polarity,361.473826
5,is_weekend,623.512383
9,global_rate_positive_words,2896.209324


In [None]:
# Value range of the feature that received the most negative coefficient above.
df["global_sentiment_polarity"].describe()

count    39247.000000
mean         0.119292
std          0.096613
min         -0.393750
25%          0.057887
50%          0.119231
75%          0.177669
max          0.727841
Name: global_sentiment_polarity, dtype: float64

In [None]:
# Interactive histogram of the global sentiment polarity.
df["global_sentiment_polarity"].iplot(kind="hist")

In [None]:
# Deciles (10%..90%) of each selected feature; the 0% percentile is left out because it
# only repeats the "min" row that describe() already reports.
df[ls_features].describe(percentiles=[decile / 10 for decile in range(1, 10)])

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_imgs,num_videos,is_weekend,title_sentiment_polarity,global_sentiment_polarity,num_keywords,global_rate_positive_words
count,39247.0,39247.0,39247.0,39247.0,39247.0,39247.0,39247.0,39247.0,39247.0,39247.0
mean,10.395138,546.870589,10.843045,4.517848,1.244197,0.130914,0.071164,0.119292,7.221673,0.039635
std,2.112312,470.336701,11.263465,8.285559,4.106577,0.337311,0.264761,0.096613,1.910654,0.017426
min,2.0,0.0,0.0,0.0,0.0,0.0,-1.0,-0.39375,1.0,0.0
0%,2.0,0.0,0.0,0.0,0.0,0.0,-1.0,-0.39375,1.0,0.0
10%,8.0,153.0,2.0,0.0,0.0,0.0,-0.133333,0.0,5.0,0.018942
20%,9.0,218.0,4.0,1.0,0.0,0.0,0.0,0.041708,5.0,0.025751
30%,9.0,277.0,5.0,1.0,0.0,0.0,0.0,0.072236,6.0,0.030769
40%,10.0,335.0,6.0,1.0,0.0,0.0,0.0,0.096501,7.0,0.035018
50%,10.0,410.0,7.0,1.0,0.0,0.0,0.0,0.119231,7.0,0.039024


### Desempeño del modelo

In [None]:
# Scale of the target, as a reference for judging the error metrics below.
y.describe()

count    39247.000000
mean      2682.871455
std       3646.993331
min          1.000000
25%        942.000000
50%       1400.000000
75%       2700.000000
max      31600.000000
Name: shares, dtype: float64

In [None]:
# Predictions on the same rows the model was fitted on, so the metrics below are in-sample.
y_pred = linreg.predict(X[ls_features])

In [None]:
# Mean absolute error (MAE), in the units of the target.
abs(y - y_pred).mean()

2095.1836449154307

In [None]:
# Mean squared error (MSE).
(y - y_pred).pow(2).mean()

13068012.095028572

In [None]:
# MAPE: Mean Absolute Percentage Error
# (Spanish acronym PEMA: "Porcentaje del Error Medio Absoluto")

$\displaystyle{MAPE = \frac{100\%}{n}\sum_{i=1}^{n}\left|\frac{y_i-\hat{y}_i}{y_i}\right|}$

In [None]:
# MAPE: mean of |error| / actual, rendered as a percentage with two decimals.
f"{(abs(y - y_pred) / y).mean():,.2%}"

'147.98%'

In [None]:
# r2_score(y_true, y_pred) returns the coefficient of determination R^2: 1.0 is a perfect
# fit, 0.0 matches always predicting the mean of y_true, and it can be negative.

In [None]:
# In-sample R^2 of the 10-feature model.
r2_score(y, y_pred)

0.017459060002567584

In [None]:
# Share of predictions that equal the target exactly; with a continuous prediction this
# is essentially always 0, so it is not an informative metric for regression.
y.eq(y_pred).mean()

0.0

### Modelo con todas las variables

In [None]:
# Fresh estimator for the all-features model. NOTE(review): this rebinds `linreg`, so the
# 10-feature model fitted earlier is no longer reachable under that name.
linreg = LinearRegression()

In [None]:
# Fit using every candidate predictor.
linreg.fit(X=X, y=y)

LinearRegression()

In [None]:
# In-sample R^2 of the all-features model (scored on the rows it was fitted on).
linreg.score(X=X, y=y)

0.0685186995685062

In [None]:
# Predictors ordered by absolute coefficient size. NOTE(review): X is fed in unscaled, so
# coefficient magnitudes depend on each feature's units and are not directly comparable.
pd.DataFrame({"variable": ls_cont, "coeficient": abs(linreg.coef_)}).sort_values("coeficient")

In [None]:
# Signed coefficients per predictor, ascending. Reuses the name `aux`, replacing the
# outlier-experiment frame built earlier.
aux = pd.DataFrame({"variable": ls_cont, "coeficient": linreg.coef_}).sort_values("coeficient")

In [None]:
# Add the coefficient magnitude as its own column.
aux = aux.assign(coeficient_abs=aux["coeficient"].abs())

In [None]:
# The ten predictors with the largest |coefficient| (a pandas Series, smallest of the ten first).
# NOTE(review): ranking raw coefficients of unscaled features favours small-scale columns
# rather than important ones; confirm whether standardising first was intended.
ls_best = aux.sort_values("coeficient_abs")["variable"].tail(10)

In [None]:
# Show the selected predictor names (index = row label in the coefficient table).
ls_best

3               n_non_stop_words
46           rate_positive_words
42           global_subjectivity
44    global_rate_positive_words
45    global_rate_negative_words
37                        LDA_00
41                        LDA_04
40                        LDA_03
38                        LDA_01
39                        LDA_02
Name: variable, dtype: object

In [None]:
# Separate estimator for the reduced model, so the all-features `linreg` is kept.
linreg_ = LinearRegression()

In [None]:
# Fit the reduced model on the ten selected predictors.
linreg_.fit(X=X[ls_best], y=y)

LinearRegression()

In [None]:
# In-sample R^2 of the reduced model.
linreg_.score(X=X[ls_best], y=y)

0.027828041322779384

In [None]:
# Coefficients of the reduced model, ascending and rounded to whole numbers.
# list(ls_best) drops the Series index so the table gets a fresh 0..n-1 index.
pd.DataFrame({"variable": list(ls_best), "coeficient": linreg_.coef_}).sort_values("coeficient").round()

Unnamed: 0,variable,coeficient
9,LDA_02,-2129433.0
8,LDA_01,-2128942.0
5,LDA_00,-2128390.0
6,LDA_04,-2128296.0
7,LDA_03,-2127479.0
3,global_rate_positive_words,-2956.0
0,n_non_stop_words,-2040.0
1,rate_positive_words,226.0
2,global_subjectivity,2873.0
4,global_rate_negative_words,8083.0


In [None]:
# Target scale once more, to compare against the coefficient magnitudes above.
df[target].describe()

count    39247.000000
mean      2682.871455
std       3646.993331
min          1.000000
25%        942.000000
50%       1400.000000
75%       2700.000000
max      31600.000000
Name: shares, dtype: float64