In [280]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import statsmodels.formula.api as smf

In [281]:
dfr=pd.read_csv('tempSiteRatings.csv')
dfd=pd.read_csv('tempUrlMetrics.csv')
dfd['rating']=dfr['rating']
dfd.to_csv('dataReg.csv')

In [282]:
data=pd.read_csv('dataReg.csv')
data.drop(['Unnamed: 0'],1, inplace=True)

In [284]:
def changeColName(data):
    try:
        columns=['word_count','body_text_percent','emphasized_text_percent','text_position_changes','text_cluster_count','link_count','page_size','graphic_percent','graphic_count','colour_count','font_count','colourfulness','visual_complexity','rating']
        data.columns=columns
    except:
        columns=['word_count','body_text_percent','emphasized_text_percent','text_position_changes','text_cluster_count','link_count','page_size','graphic_percent','graphic_count','colour_count','font_count','colourfulness','visual_complexity']
        data.columns=columns
    return data
def dropMetrics(data):
    try:
        data.drop(['id','url'],1, inplace=True)#,'p1','p2','p4','p6','p7','p10','p11','p12','p13'
    except:
        data.drop(['slno','url'],1, inplace=True)
    return data
def stdNormal(data,mean,std):
    x = data.values #returns a numpy array
    columns=data.columns
    x_scaled=(x-mean)/std
    data=pd.DataFrame(x_scaled,columns=columns)
    return data
def catCol(data,col):
    data[col+'_high']=np.zeros(data[col].shape)
    data[col+'_avg']=np.ones(data[col].shape)
    data[col+'_low']=np.zeros(data[col].shape)

    data[col+'_high'][np.where(data[col]>.431)[0]]=1.0
    data[col+'_avg'][np.where(data[col]>.431)[0]]=0.0
    data[col+'_avg'][np.where(data[col]<-.431)[0]]=0.0
    data[col+'_low'][np.where(data[col]<-.431)[0]]=1.0
    data.drop([col],1,inplace=True)
    return data
def catColPdf(data,col):
    import scipy.stats
    
    mean_high=np.mean(data[col][np.where(data[col]>.431)[0]])
    std_high=np.std(data[col][np.where(data[col]>.431)[0]])


    mean_avg=np.mean(data[col][np.where(abs(data[col])<.431)[0]])
    std_avg=np.std(data[col][np.where(abs(data[col])<.431)[0]])

    mean_low=np.mean(data[col][np.where(data[col]<-.431)[0]])
    std_low=np.std(data[col][np.where(data[col]<-.431)[0]])

    data[col+'_high']=scipy.stats.norm(mean_high,std_high).pdf(data[col].values)
    data[col+'_avg']=scipy.stats.norm(mean_avg,std_avg).pdf(data[col].values)
    data[col+'_low']=scipy.stats.norm(mean_low,std_low).pdf(data[col].values)
    
    data.drop([col],1,inplace=True)
    
    return data
def preProcess(data,mean,std):
    data=changeColName(data)
    try:
        rating=data['rating']
        data.drop(['rating'],1,inplace=True)
    except:
        pass
    data=stdNormal(data,mean,std)
    data=catCol(data,'word_count')
    data=catCol(data,'page_size') 
    data=catCol(data,'colourfulness') 
    data=catCol(data,'visual_complexity') 
    try:
        data['rating']=rating
    except:
        pass
    return data

In [285]:
data=dropMetrics(data)

In [286]:
mean=np.mean(data.values[:,:-1],axis=0)
std=np.std(data.values[:,:-1],axis=0)


In [287]:
data=preProcess(data,mean,std)
irrCol=['emphasized_text_percent','font_count']
data.drop(irrCol,1,inplace=True)



In [288]:
correlation = data.corr(method='pearson')
columns = correlation.nlargest(30,'rating').index


In [289]:
def corr():
    fig, ax = plt.subplots(figsize=(30,30))    
    correlation_map = np.corrcoef(data[columns].values.T)
    sns.set(font_scale=2.4)
    heatmap = sns.heatmap(correlation_map, cbar=True, annot=True, annot_kws={"size": 30},square=True, fmt='.2f',yticklabels=columns.values, xticklabels=columns.values)
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 50
    plt.savefig('corr')

In [290]:
columns=columns.tolist()
columns=pd.Index(columns)

In [342]:
Y=data['rating'].values
X_temp=data[columns]
X_temp.drop(['rating'],1, inplace=True)
X=X_temp.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [343]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True,test_size=0.3, random_state=1)

def LinearReg():
    from sklearn.linear_model import LinearRegression
    reg =LinearRegression(normalize=True)
    return reg
def RidgeReg():
    from sklearn.linear_model import Ridge
    reg =Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,normalize=False, random_state=1, solver='svd', tol=0.001)
    return reg
def MLPReg():
    from sklearn.neural_network import MLPRegressor
    reg=MLPRegressor(hidden_layer_sizes=(23,),activation='logistic',solver='adam', max_iter=10000,batch_size=10,alpha=0.01)
    return reg
def KNNReg():
    from sklearn.neighbors import KNeighborsRegressor
    reg=KNeighborsRegressor(n_neighbors=5,algorithm='brute',metric='manhattan')
    return reg
def RMS_test(reg):
    from sklearn.metrics import mean_squared_error
    Y_pred=reg.predict(X_test)
    return Y_pred,mean_squared_error(Y_test, Y_pred)
def RMS_train(reg):
    from sklearn.metrics import mean_squared_error
    Y_pred=reg.predict(X_train)
    return Y_pred,mean_squared_error(Y_train, Y_pred)
def RMS(reg):
    from sklearn.metrics import mean_squared_error
    Y_pred=reg.predict(X)
    return Y_pred,mean_squared_error(Y, Y_pred)
def getRMS(reg):
    print(RMS_test(reg)[1],RMS_train(reg)[1])
def getRV(reg):
    Y_pred=RMS_test(reg)[0]
    Y_pred=RMS(reg)[0]
    rsq=np.sum((Y_pred-np.mean(Y_pred))**2)/np.sum((Y-np.mean(Y))**2)
    N,p=X.shape
    adjRsq=1-((1-rsq)*(N-1)/(N-p-1))
    print(rsq,adjRsq)

In [344]:
regLinear=LinearReg()
regLinear.fit(X_train,Y_train)

regRidge=RidgeReg()
regRidge.fit(X_train,Y_train)

regMLP=MLPReg()
regMLP.fit(X_train,Y_train)

regKNN=KNNReg()
regKNN.fit(X_train,Y_train)

temp=0

In [345]:
getRMS(regLinear)
getRMS(regRidge)
getRMS(regMLP)
getRMS(regKNN)

0.8244547791465318 0.45500635514235627
0.7921293832493561 0.4602724245133802
0.775600358935138 0.05102469558420704
0.8474273231200308 0.48182256787147865


In [346]:
getRV(regLinear)
getRV(regRidge)
getRV(regMLP)
getRV(regKNN)

0.39267532081341644 0.23674060588713142
0.3193192136739357 0.14454982259021643
0.7344168543956037 0.6662265872809614
0.1442937546246311 -0.07541460567445002


In [333]:
try:
    print(regLinear.coef_)
    print(regLinear.intercept_)
except: 
    pass

[ 0.3160963   0.07044509  0.25844812  0.62915258  0.04112125 -0.06122946
 -0.12353413  0.02977424  0.15341925 -0.18755524  0.05310072 -0.08870703
 -0.1361118   0.07389667 -0.18135576 -0.16963069 -0.30916479 -0.16206067
 -1.28297824]
4.3516689439821885


In [240]:
def rateCountry(reg,regName):
    df={}
    irrCol=['emphasized_text_percent','font_count']
    country=['India','Usa']
    for year in country:
        df[str(year)]=pd.read_csv('yearMetrics/tempUrlMetrics'+str(year)+'.csv')
        df[str(year)]=dropMetrics(df[str(year)])
        df[str(year)]=preProcess(df[str(year)],mean,std)
        df[str(year)].drop(irrCol,1,inplace=True)
    meanYears=np.zeros(2)
    stdYears=np.zeros(2)

    ind=-1
    countryRating=[0,0]
    for year in country:
        ind+=1
        countryRating[ind]=reg.predict(df[str(year)][columns[1:]].values)
    x =countryRating[0]
    m=max(x)
    interval=None
    interval=[0,6]
    num_bins = 100
    fig, ax = plt.subplots(figsize=(20,12))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 36
    plt.xlabel("Rating")
    plt.ylabel(" Number of Indian Sites")
    n, bins, patches = plt.hist(x,num_bins, range=interval,facecolor='blue', alpha=0.5)
    plt.savefig("histIndia"+regName)
    plt.show()
    x =countryRating[1]
    m=max(x)
    interval=None
    interval=[0,6]
    num_bins = 100
    fig, ax = plt.subplots(figsize=(20,12))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 36
    plt.xlabel("Rating")
    plt.ylabel(" Number of Usa Sites")
    n, bins, patches = plt.hist(x,num_bins, range=interval,facecolor='blue', alpha=0.5)
    plt.savefig("histUsa"+regName)
    plt.show()

In [241]:
def showEvolution(reg,regName):
    df={}
    irrCol=['emphasized_text_percent','font_count']
    for year in range(2000,2020):
        df[str(year)]=pd.read_csv('yearMetrics/tempMpUrlMetrics'+str(year)+'.csv')
        df[str(year)]=dropMetrics(df[str(year)])
        df[str(year)]=preProcess(df[str(year)],mean,std)
        df[str(year)].drop(irrCol,1,inplace=True)
    meanYears=np.zeros(20)
    stdYears=np.zeros(20)
    for year in range(2000,2020):
        meanYears[year-2000]=np.mean(reg.predict(df[str(year)][columns[1:]].values))
        stdYears[year-2000]=np.std(reg.predict(df[str(year)][columns[1:]].values))
    fig, ax = plt.subplots(figsize=(6,6))
    x=range(2000,2020)
    y=meanYears
    e=stdYears*0
    plt.errorbar(x, y, e, fmt='-o')
    plt.xticks(np.arange(min(x), max(x)+1, 5))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 16
    plt.ylabel('Website rating')
    plt.xlabel('Year')
    plt.savefig('modelReg'+regName)

In [None]:
showEvolution(regLinear,"Linear")
showEvolution(regRidge,"Ridge")
showEvolution(regMLP,"MLP")
showEvolution(regKNN,"KNN")

In [None]:
rateCountry(regLinear,"Linear")
rateCountry(regRidge,"Ridge")
rateCountry(regMLP,"MLP")
rateCountry(regKNN,"KNN")