In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [68]:
# Load the site ratings and the per-URL page metrics from their CSV exports.
dfr=pd.read_csv('tempSiteRatings.csv')
dfd=pd.read_csv('tempUrlMetrics.csv')
# Attach the rating column to the metrics frame. pandas aligns this assignment on
# the row index, so both files are assumed to list the sites in the same order -- TODO confirm.
dfd['rating']=dfr['rating']
# Persist the combined table. No index=False is passed, so the row index is
# written as an extra unnamed first column.
dfd.to_csv('dataReg.csv')

In [69]:
# Reload the combined metrics + rating table from disk.
data=pd.read_csv('dataReg.csv')
# The index that was saved with the table comes back as an 'Unnamed: 0' column;
# remove it.  The axis is named through `columns=` because pandas 2.0 no longer
# accepts it as a second positional argument.
data.drop(columns=['Unnamed: 0'], inplace=True)

In [70]:
def changeColName(data):
    """Rename the columns of ``data`` in place to the canonical metric names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the 13 page metrics, optionally followed by one extra
        column that is treated as the rating.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with its columns renamed.

    Raises
    ------
    ValueError
        Raised by pandas when the frame has neither 13 nor 14 columns.
    """
    columns=['word_count','body_text_percent','emphasized_text_percent','text_position_changes','text_cluster_count','link_count','page_size','graphic_percent','graphic_count','colour_count','font_count','colourfulness','visual_complexity']
    # Decide explicitly from the column count instead of relying on a bare
    # `except:` (which would also swallow KeyboardInterrupt and real bugs).
    if len(data.columns)==len(columns)+1:
        columns=columns+['rating']
    data.columns=columns
    return data
def dropMetrics(data):
    """Remove the identifier and URL columns from ``data`` in place.

    The identifier column is called ``'id'`` in some inputs and ``'slno'`` in
    others; whichever is present is dropped together with ``'url'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``'url'`` and either ``'id'`` or ``'slno'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the two columns.

    Raises
    ------
    KeyError
        If ``'url'`` or the identifier column is missing.
    """
    # Pick the identifier column explicitly rather than via a bare `except:`.
    id_col='id' if 'id' in data.columns else 'slno'
    # `columns=` keyword: pandas 2.0 rejects the axis as a positional argument.
    data.drop(columns=[id_col,'url'], inplace=True)
    return data
def stdNormal(data,mean,std):
    """Standardise every column of ``data`` as ``(value - mean) / std``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to rescale.
    mean, std : array-like or scalar
        Broadcast against ``data.values``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and a fresh default index
        (the index of ``data`` is not carried over).
    """
    standardized=(data.values-mean)/std
    return pd.DataFrame(standardized,columns=data.columns)
def catCol(data,col,threshold=.431):
    """Replace a standardised column by three one-hot indicator columns.

    ``col`` is split into ``col+'_high'`` (value > threshold),
    ``col+'_low'`` (value < -threshold) and ``col+'_avg'`` (everything else,
    including values exactly on a boundary and NaN).  The original column is
    removed.  ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Name of the column to categorise.
    threshold : float, optional
        Symmetric cut-off.  The default 0.431 is roughly the 2/3 quantile of a
        standard normal, i.e. it presumably aims at three equally likely bins
        for standardised data.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    values=data[col]
    is_high=values>threshold
    is_low=values<-threshold

    # Whole-column assignment instead of `data[new][positions] = ...`: the
    # chained form writes through a temporary (no effect under pandas
    # copy-on-write) and used positions as index labels, which is only right
    # for a default RangeIndex.
    data[col+'_high']=np.where(is_high,1.0,0.0)
    data[col+'_avg']=np.where(is_high|is_low,0.0,1.0)
    data[col+'_low']=np.where(is_low,1.0,0.0)

    # `columns=` keyword: pandas 2.0 rejects the axis as a positional argument.
    data.drop(columns=[col],inplace=True)
    return data
def catColPdf(data,col,threshold=.431):
    """Replace a standardised column by three soft-membership columns.

    The values of ``col`` are split into a high group (> threshold), an
    average group (|value| < threshold) and a low group (< -threshold).  A
    normal distribution is fitted to each group (mean and population standard
    deviation of its members) and its density is evaluated at every value of
    ``col``; the results are stored as ``col+'_high'``, ``col+'_avg'`` and
    ``col+'_low'``.  The original column is removed.  ``data`` is modified in
    place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Name of the column to convert.
    threshold : float, optional
        Symmetric cut-off between the groups (default 0.431).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    A group with no members, or with zero spread, yields NaN densities.
    """
    import scipy.stats

    # Work on the raw array with boolean masks: the previous version fed
    # np.where positions back into label-based Series indexing, which is only
    # right for a default RangeIndex.
    values=data[col].values
    groups=(
        ('_high',values>threshold),
        ('_avg',abs(values)<threshold),
        ('_low',values<-threshold),
    )
    for suffix,mask in groups:
        members=values[mask]
        # np.std uses ddof=0 (population standard deviation).
        fitted=scipy.stats.norm(np.mean(members),np.std(members))
        data[col+suffix]=fitted.pdf(values)

    # `columns=` keyword: pandas 2.0 rejects the axis as a positional argument.
    data.drop(columns=[col],inplace=True)

    return data
def preProcess(data,mean,std):
    """Rename, standardise and categorise the metric columns of ``data``.

    Steps: give the columns their canonical names, set the rating column aside
    (if there is one), standardise the metrics with ``mean``/``std``, turn
    ``word_count``, ``page_size``, ``colourfulness`` and ``visual_complexity``
    into high/avg/low indicator columns, and finally put the rating back as
    the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw metrics, with or without a trailing rating column.  The frame is
        renamed in place and loses its rating column.
    mean, std : array-like
        Per-metric statistics used for standardisation (one entry per metric
        column, rating excluded).

    Returns
    -------
    pandas.DataFrame
        A new frame with a default index.
    """
    data=changeColName(data)

    # Explicit presence check instead of two bare `except: pass` blocks, which
    # also hid the TypeError that a positional axis raises on pandas 2.0.
    rating=None
    if 'rating' in data.columns:
        rating=data['rating']
        data.drop(columns=['rating'],inplace=True)

    data=stdNormal(data,mean,std)
    for col in ('word_count','page_size','colourfulness','visual_complexity'):
        data=catCol(data,col)

    if rating is not None:
        # stdNormal returns a frame with a fresh default index, so re-attach
        # the ratings by position; assigning the Series itself would align on
        # the old index and give NaN for any non-default index.
        data['rating']=rating.values
    return data

In [71]:
# Strip the identifier ('id' or 'slno') and 'url' columns so that only the
# page metrics and the rating remain.
data=dropMetrics(data)

In [72]:
# Column-wise mean and (population) standard deviation of every column except
# the last one, which is left out of the statistics.
feature_values=data.values[:,:-1]
mean=feature_values.mean(axis=0)
std=feature_values.std(axis=0)


In [73]:
# Standardise and categorise the metrics, then discard the two columns that
# are not used in the regression.
data=preProcess(data,mean,std)
irrCol=['emphasized_text_percent','font_count']
# `columns=` keyword: pandas 2.0 rejects the axis as a positional argument.
data.drop(columns=irrCol,inplace=True)



In [74]:
# Pairwise Pearson correlations between all columns.
correlation = data.corr(method='pearson')
# Labels of (at most) the 30 rows with the largest correlation to the rating,
# in descending order of that correlation.
columns = correlation.nlargest(n=30, columns='rating').index


In [75]:
def corr():
    """Draw an annotated correlation heat-map and save it to the file 'corr'.

    Reads the notebook-level ``data`` and ``columns``: the correlation matrix
    of ``data[columns]`` is plotted with ``columns`` as tick labels on both
    axes.  Takes no arguments and returns nothing.

    Side effects: changes seaborn's global theme (font scale 2.4) and the
    global matplotlib ``font.family`` / ``font.size`` settings.
    """
    fig, ax = plt.subplots(figsize=(30,30))
    correlation_map = np.corrcoef(data[columns].values.T)
    sns.set(font_scale=2.4)
    # Font settings must be in place before the heat-map creates its text:
    # rcParams are read when text objects are built, so setting them after
    # drawing (as before) never reached this figure.  They come after sns.set()
    # because that call re-applies seaborn's own font settings.
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 50
    # Draw on the axes created above explicitly rather than on "the current axes".
    sns.heatmap(correlation_map, cbar=True, annot=True, annot_kws={"size": 30},square=True, fmt='.2f',yticklabels=columns.values, xticklabels=columns.values, ax=ax)
    plt.savefig('corr')

In [76]:
# Put the selected column names into alphabetical order, keeping them as a
# pandas Index.
columns=pd.Index(sorted(columns.tolist()))

In [77]:
# Target vector: the rating of every row.
Y=data['rating'].values
# Predictor names: the selected columns without the target.  errors='ignore'
# keeps this cell re-runnable once 'rating' has already been removed.
columns=columns.drop('rating',errors='ignore')
# Select the predictors directly instead of dropping 'rating' in place from a
# selection of `data` (SettingWithCopyWarning, and a positional axis that
# pandas 2.0 rejects).
X_temp=data[columns]
X=X_temp.values

In [78]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 train/test partition with a fixed seed so the split is
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

def RMS_test(reg):
    """Predict on the held-out set and score the predictions.

    Reads the notebook-level ``X_test`` and ``Y_test``.

    Returns
    -------
    tuple
        ``(predictions, mse)``.  Despite the function name, the score is the
        mean squared error as returned by sklearn, not its square root.
    """
    from sklearn.metrics import mean_squared_error
    predictions=reg.predict(X_test)
    mse=mean_squared_error(Y_test, predictions)
    return predictions,mse
def RMS_train(reg):
    """Predict on the training set and score the predictions.

    Reads the notebook-level ``X_train`` and ``Y_train``.

    Returns
    -------
    tuple
        ``(predictions, mse)``.  Despite the function name, the score is the
        mean squared error as returned by sklearn, not its square root.
    """
    from sklearn.metrics import mean_squared_error
    predictions=reg.predict(X_train)
    mse=mean_squared_error(Y_train, predictions)
    return predictions,mse
def RMS(reg):
    """Predict on the complete data set and score the predictions.

    Reads the notebook-level ``X`` and ``Y``.

    Returns
    -------
    tuple
        ``(predictions, mse)``.  Despite the function name, the score is the
        mean squared error as returned by sklearn, not its square root.
    """
    from sklearn.metrics import mean_squared_error
    predictions=reg.predict(X)
    mse=mean_squared_error(Y, predictions)
    return predictions,mse
def getRMS(reg):
    """Print the test-set score followed by the training-set score of ``reg``.

    The two numbers are the second elements returned by ``RMS_test`` and
    ``RMS_train`` respectively.
    """
    test_score=RMS_test(reg)[1]
    train_score=RMS_train(reg)[1]
    print(test_score,train_score)
def getRV(reg):
    """Print R-squared and adjusted R-squared of ``reg`` on the complete data set.

    Reads the notebook-level ``X`` and ``Y``.  R-squared is computed as the sum
    of squared deviations of the predictions from their own mean, divided by
    the sum of squared deviations of ``Y`` from its mean.  The adjusted value
    uses ``N`` rows and ``p`` predictor columns taken from ``X.shape``.

    Parameters
    ----------
    reg : object
        Fitted model exposing ``predict``.
    """
    # Predict once on the full data.  The earlier test-set prediction was
    # overwritten immediately (dead store), and the MSE computed alongside the
    # full-data prediction was never used.
    Y_pred=reg.predict(X)
    rsq=np.sum((Y_pred-np.mean(Y_pred))**2)/np.sum((Y-np.mean(Y))**2)
    N,p=X.shape
    adjRsq=1-((1-rsq)*(N-1)/(N-p-1))
    print(rsq,adjRsq)

In [80]:
# Display the predictor names that remain after 'rating' was removed.
columns

Index(['body_text_percent', 'colour_count', 'colourfulness_avg',
       'colourfulness_high', 'colourfulness_low', 'graphic_count',
       'graphic_percent', 'link_count', 'page_size_avg', 'page_size_high',
       'page_size_low', 'text_cluster_count', 'text_position_changes',
       'visual_complexity_avg', 'visual_complexity_high',
       'visual_complexity_low', 'word_count_avg', 'word_count_high',
       'word_count_low'],
      dtype='object')

In [81]:
# Build the regression formula 'rating ~ a + b + ...' from the predictor names.
# str.join puts the separator only between terms, so no trailing ' + ' has to
# be sliced off and an empty predictor list leaves the prefix intact.
col='rating ~ '+' + '.join(columns)
col

'rating ~ body_text_percent + colour_count + colourfulness_avg + colourfulness_high + colourfulness_low + graphic_count + graphic_percent + link_count + page_size_avg + page_size_high + page_size_low + text_cluster_count + text_position_changes + visual_complexity_avg + visual_complexity_high + visual_complexity_low + word_count_avg + word_count_high + word_count_low'

In [82]:
# Fit ordinary least squares from the formula string held in `col`.
# NOTE(review): `reg` is re-assigned by the sm.OLS(Y, X) fit in the following
# cell, so on a top-to-bottom run this result is not what later cells inspect.
reg=smf.ols(col,data=data).fit()

In [83]:
# Fit ordinary least squares directly on the numpy arrays Y and X; this
# replaces any previous value of `reg`.
# NOTE(review): sm.OLS does not add an intercept column by itself -- presumably
# acceptable here because each *_high/_avg/_low triple sums to 1 per row; confirm.
reg=sm.OLS(Y,X).fit()

In [86]:
# F-statistic of the fitted model and its p-value.
reg.fvalue,reg.f_pvalue

(2.200171833423492, 0.013013155561838076)

In [53]:
# Estimated coefficients.  When `reg` is the sm.OLS(Y, X) fit, their order
# follows the column order of X, i.e. the order of `columns`.
reg.params

array([-0.0484363 , -0.11096567,  0.91073359,  1.04038209,  1.10366847,
        0.28351828, -0.04507594, -0.0858187 ,  1.28335265,  1.69534203,
        0.07608947, -0.20395379, -0.13099741,  1.02781056,  1.1044421 ,
        0.92253149,  1.33764855,  0.64728108,  1.06985451])

In [54]:
# Predictor names, shown for matching against the coefficient array.
columns

Index(['body_text_percent', 'colour_count', 'colourfulness_avg',
       'colourfulness_high', 'colourfulness_low', 'graphic_count',
       'graphic_percent', 'link_count', 'page_size_avg', 'page_size_high',
       'page_size_low', 'text_cluster_count', 'text_position_changes',
       'visual_complexity_avg', 'visual_complexity_high',
       'visual_complexity_low', 'word_count_avg', 'word_count_high',
       'word_count_low'],
      dtype='object')