In [3]:
?

In [4]:
import csv
import timm
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
import torch.backends.cudnn as cudnn
from torch.utils.data import Dataset
import pandas as pd
import warnings
import os
import copy
from PIL import Image
import math
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def r2(df):
    pearson = df.corr().iloc[0,1]
    spearman = df.corr('spearman').iloc[0, 1]
    kendall = df.corr('kendall').iloc[0, 1]
    df = df.dropna()
    x = df.iloc[:,0].values
    y = df.iloc[:,1].values
    # Fit a simple linear regression model
    beta, _, _, _ = np.linalg.lstsq(x[:, None], y)
    y_pred = beta * x
    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)

    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)
    # Calculate the adjusted R2
    n = x.shape[0]
    p = 1
    adj_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
    
    y /= y.max()
    x /= x.max()
    
    # Calculate the residuals
    residuals = x - y
    # Calculate the mean squared error (MSE)
    mse = np.mean(residuals**2)
    # Calculate the root mean squared error (RMSE)
    rmse = np.sqrt(mse)
    # Calculate the absolute percentage error
    ape = np.abs((y - x) / y) * 100
    # Calculate the mean absolute percentage error (MAPE)
    mape = np.mean(ape)
    
    return {'pearson':pearson, 'spearman':spearman, 'kendall':kendall, 'r2':r2, 'adj_r2':adj_r2, 'rmse':rmse, 'mape':mape}

In [6]:
class TestDataset(Dataset):
    def __init__(self, transform=None):
        self.file_list = list(os.listdir(data_path))
        self.transform = transform        

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        name = self.file_list[idx]
        image = Image.open(f"{data_path}{name}").convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image,name
data_path = './data12/train/data_zl12/' 
_mean = [0.485, 0.456, 0.406]
_std = [0.229, 0.224, 0.225]
test_dataset = TestDataset(transform = transforms.Compose([transforms.Resize(224), transforms.ToTensor(), transforms.Normalize(mean=_mean, std=_std)]))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=0)                              

In [7]:
df = pd.read_csv('df.csv')

In [8]:
torch.cuda.empty_cache()

In [26]:
model = torch.load('eff-bin15.pt')

names, scores = [], []
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    for batch_idx, (data,name) in tqdm(enumerate(test_loader), total=len(test_loader)):  
        data = data.to(device)
        score = model(data).squeeze()
        scores += list(score.cpu().data.numpy())
        names += list(name)
eff = pd.DataFrame({'name':names, 'eff0000':scores})

100%|██████████| 258/258 [13:19<00:00,  3.10s/it]


In [None]:
df = df.merge(eff)

In [None]:
r2(df[['eff00', 'log_gdp']])


In [23]:
r2(df[['eff00', 'log_gdp']])


  if __name__ == '__main__':


{'pearson': 0.33518292631385904,
 'spearman': 0.3828632920760336,
 'kendall': 0.26226022209906674,
 'r2': -80.70908973473077,
 'adj_r2': -80.70975051747094,
 'rmse': 0.900601736943778,
 'mape': 116.87303775666689}

In [56]:
model = torch.load('vgg-unlimited.pt')

names, scores = [], []
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    for batch_idx, (data,name) in tqdm(enumerate(test_loader), total=len(test_loader)):
        data = data.to(device)
        score = model(data).squeeze()
        scores += list(score.cpu().data.numpy())
        names += list(name)
vgg = pd.DataFrame({'name':names, 'vgg':scores})

100%|██████████| 258/258 [14:47<00:00,  3.44s/it]


In [62]:
model = torch.load('vit-unlimited.pt')

names, scores = [], []
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    for batch_idx, (data,name) in tqdm(enumerate(test_loader), total=len(test_loader)):
        data = data.to(device)
        score = model(data).squeeze()
        scores += list(score.cpu().data.numpy())
        names += list(name)
eff = pd.DataFrame({'name':names, 'vit':scores})

100%|██████████| 258/258 [09:34<00:00,  2.23s/it]


In [58]:
df = df.merge(eff)

In [59]:
df = df.merge(vgg)

In [64]:
df = df.merge(eff)

In [4]:
model = torch.load('res-bin15.pt')

In [8]:
names, scores = [], []
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    for batch_idx, (data,name) in tqdm(enumerate(test_loader), total=len(test_loader)):
        data = data.to(device)
        score = model(data).squeeze()
        scores += list(score.cpu().data.numpy())
        names += list(name)
data = pd.DataFrame({'name':names, 'resb15':scores})

100%|██████████| 258/258 [09:41<00:00,  2.25s/it]


In [11]:
df = df.merge(data, left_on='name', right_on='name')

In [12]:
df['log_gdp'] = df['gdp'].map(np.log)

In [16]:
r2(df[['eff1', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.22361723841610415,
 'spearman': 0.2546298698253262,
 'kendall': 0.17148388550021973,
 'r2': -100.8289592125274,
 'adj_r2': -100.82978270497988,
 'rmse': 0.9418031705750289,
 'mape': 110.71496024504853}

In [66]:
r2(df[['vgg', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.3858252411821878,
 'spearman': 0.3971626492020586,
 'kendall': 0.2699910853275505,
 'r2': -93.91628360798822,
 'adj_r2': -93.91705119752045,
 'rmse': 0.9921648908679669,
 'mape': 121.8737231684787}

In [65]:
r2(df[['vit', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.5909179873972293,
 'spearman': 0.5495222622057283,
 'kendall': 0.3846805870146383,
 'r2': -24.99106100557484,
 'adj_r2': -24.991271195708723,
 'rmse': 0.5572076121857689,
 'mape': 72.7796141359473}

In [21]:
r2(df[['nl-50', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.30233065163955863,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': -70.44349085066321,
 'adj_r2': -70.44406861533791,
 'rmse': 0.7483089124266302,
 'mape': 98.25769417959842}

In [19]:
r2(df[['res-m', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.5441409810210416,
 'spearman': 0.5403589616996938,
 'kendall': 0.37916546898144254,
 'r2': -42.1526313365889,
 'adj_r2': -42.1529803126217,
 'rmse': 0.6037049418082139,
 'mape': 78.56575675430682}

In [16]:
r2(df[['nl-50', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.30233065163955863,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': -70.44349085066321,
 'adj_r2': -70.44406861533791,
 'rmse': 0.7483089124266302,
 'mape': 98.25769417959842}

In [15]:
r2(df[['resb15', 'log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.41857358983562964,
 'spearman': 0.43996682247400215,
 'kendall': 0.3001788238671966,
 'r2': -81.85255445363859,
 'adj_r2': -81.85322448359656,
 'rmse': 0.6887719227431028,
 'mape': 87.8714041972913}

In [12]:
r2(df[['resb15', 'gdp']])

  if __name__ == '__main__':


{'pearson': 0.3369895516075426,
 'spearman': 0.43996682247400215,
 'kendall': 0.3001788238671966,
 'r2': -0.09936321974655749,
 'adj_r2': -0.09937211031483018,
 'rmse': 0.22236814071317088,
 'mape': 2404.6035442692855}

In [106]:
r2(df[['b50+', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3798068612328396,
 'spearman': 0.42224809543321107,
 'kendall': 0.2918165645459292,
 'r2': -0.39789902734372107,
 'adj_r2': -0.3979103321759343,
 'rmse': 0.3949837208668952,
 'mape': 5536.190657655905}

In [67]:
print(r2(df.groupby('PAC').sum()[['res-m', 'gdp']]))
r2(df.groupby('PAC').sum()[['nl-50', 'gdp']])

{'pearson': 0.6282567581954062, 'spearman': 0.7016974656419451, 'kendall': 0.5179125258972532, 'r2': 0.39249433236387843, 'adj_r2': 0.39219697276151133, 'rmse': 0.046450690212443475, 'mape': 310.60540743462343}



`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.6345036630441131,
 'spearman': 0.7352745938435575,
 'kendall': 0.5504131598715784,
 'r2': 0.40240309917600336,
 'adj_r2': 0.40211058967976054,
 'rmse': 0.04453297374331508,
 'mape': 358.5788592765964}

In [101]:
r2(df[['b50_y', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3609465820022717,
 'spearman': 0.42430504630908406,
 'kendall': 0.29386435494938357,
 'r2': -0.40525534904815386,
 'adj_r2': -0.4052667133710608,
 'rmse': 0.4451971592864422,
 'mape': 6483.164649811903}

In [97]:
r2(df[a['b50', 'gdp']])a


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3880581359133165,
 'spearman': 0.4456177495363686,
 'kendall': 0.3092557923781333,
 'r2': -0.21520515020754405,
 'adj_r2': -0.21521497759139585,
 'rmse': 0.3041258532110962,
 'mape': 3417.987958772044}

In [93]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.46142984728811526,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': 0.11675978090135652,
 'adj_r2': 0.11675263812331194,
 'rmse': 0.046847824993456064,
 'mape': 112.1632197723924}

In [88]:
r2(df[['I40_y', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.34596730369882245,
 'spearman': 0.4187137782017362,
 'kendall': 0.2922374238493872,
 'r2': 0.002901804154029186,
 'adj_r2': 0.002893740604671291,
 'rmse': 0.955147727965253,
 'mape': 15094.229116378867}

In [75]:
r2(df[['I20', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3839928376339375,
 'spearman': 0.46778001928893276,
 'kendall': 0.32392581031272033,
 'r2': 0.010921749782880519,
 'adj_r2': 0.010913751090953605,
 'rmse': 0.9187233565817696,
 'mape': 14401.211866192838}

In [80]:
r2(df[['I40', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.36904558849149444,
 'spearman': 0.42931782062222207,
 'kendall': 0.2981876868890798,
 'r2': 0.005324920401458377,
 'adj_r2': 0.005316876447881036,
 'rmse': 0.940758681469123,
 'mape': 14836.181822070475}

In [70]:
r2(df[['I30', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.2524991809843522,
 'spearman': 0.3482577608991025,
 'kendall': 0.23706411056179547,
 'r2': -0.010047846203480537,
 'adj_r2': -0.010056014476871766,
 'rmse': 0.0509735485217046,
 'mape': 131.54558242752245}

In [57]:
r2(df[['I3', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.39757829522786114,
 'spearman': 0.46319319471060083,
 'kendall': 0.3231975888052216,
 'r2': 0.007614494463483501,
 'adj_r2': 0.007606469025729035,
 'rmse': 0.9239935469192845,
 'mape': 14538.30923787915}

In [64]:
r2(df[['nl100', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.2524991809843522,
 'spearman': 0.3482577608991025,
 'kendall': 0.23706411056179547,
 'r2': -0.010047846203480537,
 'adj_r2': -0.010056014476871766,
 'rmse': 0.0509735485217046,
 'mape': 131.54558242752245}

In [76]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.46142984728811526,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': 0.11675978090135652,
 'adj_r2': 0.11675263812331194,
 'rmse': 0.046847824993456064,
 'mape': 112.1632197723924}

In [47]:
r2(df[['10b100', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.26847199448531234,
 'spearman': 0.26558063650414837,
 'kendall': 0.17838555362277694,
 'r2': -0.0007515984010433296,
 'adj_r2': -0.0007596914955272371,
 'rmse': 0.9849096219597147,
 'mape': 15610.7257183788}

In [43]:
a


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.30483595755784076,
 'spearman': 0.33038537798630024,
 'kendall': 0.22341388083421196,
 'r2': -0.43300373772901524,
 'adj_r2': -0.43301532645359364,
 'rmse': 0.08747911811944899,
 'mape': 933.6307621294866}

In [37]:
r2(df[['can', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3679126502455629,
 'spearman': 0.2929216402805242,
 'kendall': 0.198800618968884,
 'r2': 0.06494663379913068,
 'adj_r2': 0.06493907200732119,
 'rmse': 0.047407936188694086,
 'mape': 185.4010155904088}

In [34]:
r2(df[['can', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3679126502455629,
 'spearman': 0.2929216402805242,
 'kendall': 0.198800618968884,
 'r2': 0.06494663379913068,
 'adj_r2': 0.06493907200732119,
 'rmse': 0.047407936188694086,
 'mape': 185.4010155904088}

In [28]:
r2(df[['vit', 'gdp']])a


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.4665368189641487,
 'spearman': 0.5495222622057283,
 'kendall': 0.3846805870146383,
 'r2': 0.21667059206545958,
 'adj_r2': 0.21666425726777294,
 'rmse': 0.21909102638553668,
 'mape': 1676.3766349950179}

In [29]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.46142984728811526,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': 0.11675978090135652,
 'adj_r2': 0.11675263812331194,
 'rmse': 0.046847824993456064,
 'mape': 112.1632197723924}

In [22]:
r2([['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.38710724386789785,
 'spearman': 0.5633368559979537,
 'kendall': 0.40810798719716385,
 'r2': 0.14836623147722094,
 'adj_r2': 0.1479493769649729,
 'rmse': 0.34113375302750587,
 'mape': 1248.4316358116128}

In [19]:
r2(df.groupby('PAC').mean()[['try_y', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.47766386409352724,
 'spearman': 0.6418454968843796,
 'kendall': 0.4630413038159058,
 'r2': 0.0621347462783709,
 'adj_r2': 0.06167568350121877,
 'rmse': 0.7446527313545779,
 'mape': 4538.416161277413}

In [18]:
r2(df[['try_x', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.39061184475105454,
 'spearman': 0.472039734861121,
 'kendall': 0.3236861261927195,
 'r2': -0.12326002101020594,
 'adj_r2': -0.1232691048322998,
 'rmse': 0.19988304343658256,
 'mape': 2068.5827236102336}

In [10]:
r2(df[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.36238704398897276,
 'spearman': 0.5403589616996938,
 'kendall': 0.37916546898144254,
 'r2': 0.09723733763627918,
 'adj_r2': 0.09723003697991783,
 'rmse': 0.20246679935522802,
 'mape': 1687.4788795490074}

In [24]:
r2(df[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.38710724386789785,
 'spearman': 0.5633368559979537,
 'kendall': 0.40810798719716385,
 'r2': 0.14836623147722094,
 'adj_r2': 0.1479493769649729,
 'rmse': 0.34113375302750587,
 'mape': 1248.4316358116128}

In [None]:
df = pd.read_csv('df.csv')`
data = pd.read_csv('data.csv')
df.columns = ['name', 'nightlights', 'score', 'PAC', 'population', 'gdp', 'host',
       'second', 'newloss', 'rank25', 'rank10', 'rank10-', 'rank10-drop',
       'nl-res', 'res-rankloss-10bins-100', 'nl', 'res-m', 'pca', 'ae',
       'nl-50', 'res50+', 'res50++', 'res2loss', 'res1loss', 'name_y', 'first',
       'code', 'name_x', 'res1corr', 'mid', 'im', 'imm_x', 'name_y', 'imm_y']

In [25]:
r2(df[['immmm', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.38165152792587603,
 'spearman': 0.46658016191830537,
 'kendall': 0.3222811686841678,
 'r2': 5.2063061708906844e-05,
 'adj_r2': 4.3976466448381046e-05,
 'rmse': 0.14577346285226395,
 'mape': 1232.9856622094635}

In [12]:
r2(df[['imm_y', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.4379138633970226,
 'spearman': 0.4298089755487276,
 'kendall': 0.2946642955422226,
 'r2': 0.1542196021500164,
 'adj_r2': 0.15421276231015668,
 'rmse': 0.09989749712580036,
 'mape': 785.0873834035491}

In [279]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.46142984728811526,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': 0.11675978090135652,
 'adj_r2': 0.11675263812331194,
 'rmse': 0.046847824993456064,
 'mape': 112.1632197723924}

In [278]:
r2(df[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.36238704398897276,
 'spearman': 0.5403589616996938,
 'kendall': 0.37916546898144254,
 'r2': 0.09723733763627918,
 'adj_r2': 0.09723003697991783,
 'rmse': 0.20246679935522802,
 'mape': 1687.4788795490074}

In [277]:
r2(df[['im', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.32816611275583646,
 'spearman': 0.5053476624828702,
 'kendall': 0.3533417118169351,
 'r2': 0.08140896523060559,
 'adj_r2': 0.08140153656993865,
 'rmse': 0.21356005974997305,
 'mape': 1815.6957997392103}

In [269]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.46142984728811526,
 'spearman': 0.5029931674585858,
 'kendall': 0.35504494512737533,
 'r2': 0.11675978090135652,
 'adj_r2': 0.11675263812331194,
 'rmse': 0.046847824993456064,
 'mape': 112.1632197723924}

In [154]:
data.columns = ['name', '25bins']

In [161]:
df.columns = ['name', 'cluster_lat', 'cluster_lon', 'nightlights', 'predict',
       'score', 'lo', 'lat', 'PAC', 'name_y', 'population', 'code_x',
       'name_x.1', 'gdp', 'code_y', 'name_y.1', 'host', 'code_x.1', 'name_x',
       'second', 'code_y.1', 'name_y', 's']

In [92]:
df = df.merge(data, left_on='name', right_on='name')

In [93]:
df.to_csv('df.csv', index=False)

In [115]:
df = df.drop(['Unnamed: 0'], axis=1)

In [116]:
pca = np.load('pac_y.npy')

In [121]:
pca = pd.DataFrame({'name': data.name, 'pca':pca})

In [124]:
df.to_csv('df.csc', index=False)

In [95]:
df[['nl', 'gdp']].corr('spearman')

Unnamed: 0,nl,gdp
nl,1.0,0.03601
gdp,0.03601,1.0


In [94]:
df[['res-m', 'gdp']].corr('spearman')

Unnamed: 0,res-m,gdp
res-m,1.0,0.541525
gdp,0.541525,1.0


In [86]:
df[['nightlights', 'gdp']].corr('spearman')

Unnamed: 0,nightlights,gdp
nightlights,1.0,0.391717
gdp,0.391717,1.0


In [85]:
df.groupby('PAC').mean()[['nightlights', 'gdp']].corr('spearman')

Unnamed: 0,nightlights,gdp
nightlights,1.0,0.698051
gdp,0.698051,1.0


In [79]:
df.groupby('PAC').mean()[['nl', 'gdp']].corr('spearman')

Unnamed: 0,nl,gdp
nl,1.0,0.166461
gdp,0.166461,1.0


In [70]:
df.groupby('PAC').mean()[['res-rankloss-10bins-100', 'gdp']].corr('spearman')

Unnamed: 0,res-rankloss-10bins-100,gdp
res-rankloss-10bins-100,1.0,0.477945
gdp,0.477945,1.0


In [49]:
df.groupby('PAC').mean()[['rank10-drop', 'gdp']].corr('spearman')

Unnamed: 0,rank10-drop,gdp
rank10-drop,1.0,0.413136
gdp,0.413136,1.0


In [37]:
df.to_csv('df_model3_25_bins.csv', index=False)

In [10]:
df = pd.read_csv('df.csv')

In [None]:
data

In [61]:
df = df.merge(data)

In [None]:
df

In [37]:
df.groupby('PAC').mean()[['rank10', 'gdp']].corr('spearman')

Unnamed: 0,rank10,gdp
rank10,1.0,0.56399
gdp,0.56399,1.0


In [36]:
df.groupby('PAC').mean()[['rank10-', 'nightlights']].corr('spearman')

Unnamed: 0,rank10-,nightlights
rank10-,1.0,0.668047
nightlights,0.668047,1.0


In [None]:
df.to_csv()

In [64]:
df.groupby('PAC').mean()[['nightlights', 'nl-res']].corr('spearman')

Unnamed: 0,nightlights,nl-res
nightlights,1.0,0.140072
nl-res,0.140072,1.0


In [40]:
df.groupby('PAC').mean()[['rank10-', 'gdp']].corr('spearman')

Unnamed: 0,rank10-,gdp
rank10-,1.0,0.565897
gdp,0.565897,1.0


In [71]:
df.groupby('PAC').mean()[['nightlights', 'gdp']].corr()

Unnamed: 0,nightlights,gdp
nightlights,1.0,0.587653
gdp,0.587653,1.0


In [39]:
df.groupby('PAC').mean()[['rank10', 'gdp']].corr()

Unnamed: 0,rank10,gdp
rank10,1.0,0.429859
gdp,0.429859,1.0


In [18]:
df.groupby('PAC').mean()[['rank25', 'gdp']].corr('spearman')

Unnamed: 0,rank25,gdp
rank25,1.0,0.486958
gdp,0.486958,1.0


In [1]:
import pandas as pd

In [12]:
df = pd.read_csv('df_model1.csv')

In [13]:
cp = pd.read_csv('./center2PA.csv')

In [14]:
df = df.merge(cp)

In [15]:
population = pd.read_csv('population.csv')
gdp = pd.read_csv('地区生产总值.csv')
second = pd.read_csv('第二产业增加值.csv')
host = pd.read_csv('医院、卫生院床位数.csv')

In [16]:
gdp['PAC'] = gdp.code.map(lambda x: str(int(x))) 

In [89]:
df = df.merge(second, left_on='PAC', right_on='PAC')

KeyError: 'PAC'

In [48]:
df.columns = ['name_x', 'cluster_lat', 'cluster_lon', 'nightlights', 'predict',
       'score', 'lo', 'lat', 'PAC', 'name_y', 'population', 'code_x', 'name_x',
       'gdp', 'code_y', 'name_y', 'host', 'code_x', 'name', 'second', 'code_y']

In [30]:
df[['nightlights', 'score']].corr(method='spearman')

Unnamed: 0,nightlights,score
nightlights,1.0,0.807044
score,0.807044,1.0


In [50]:
df[['s', 'second']].corr(method='spearman')

Unnamed: 0,score,second
score,1.0,0.283507
second,0.283507,1.0


In [80]:
df

Unnamed: 0,name_x,cluster_lat,cluster_lon,nightlights,predict,score,lo,lat,PAC,name_y,...,name_x.1,gdp,code_y,name_y.1,host,code_x,name,second,code_y.1,s


In [79]:
df[['s', 'second']].corr(method='spearman')

Unnamed: 0,s,second
s,,
second,,


In [165]:
df[['25bins', 'gdp']].corr(method='spearman')

Unnamed: 0,25bins,nightlights
25bins,1.0,0.744819
nightlights,0.744819,1.0


In [51]:
df[['score', 'gdp']].corr(method='spearman')

Unnamed: 0,score,gdp
score,1.0,0.366257
gdp,0.366257,1.0


In [53]:
df[['nightlights', 'gdp']].corr()

Unnamed: 0,nightlights,gdp
nightlights,1.0,0.302683
gdp,0.302683,1.0


In [55]:
df_mean = df.groupby('PAC').mean()

In [56]:
df_mean[['nightlights', 'gdp']].corr()

Unnamed: 0,nightlights,gdp
nightlights,1.0,0.587653
gdp,0.587653,1.0


In [58]:
df_mean[['score', 'gdp']].corr('spearman')

Unnamed: 0,score,gdp
score,1.0,0.537587
gdp,0.537587,1.0


In [None]:
df_mean[['score', 'gdp']].corr('spearman')

In [78]:
df_mean[['s', 'gdp']].corr('spearman')

KeyError: "['s'] not in index"

In [61]:
df.to_csv('total.csv', index=False)

In [103]:
df = pd.read_csv('total.csv')

In [106]:
df = df.merge(data, left_on='name_x', right_on='name')

In [108]:
df[['s', 'gdp']].corr(method='spearman')

Unnamed: 0,s,gdp
s,1.0,0.409483
gdp,0.409483,1.0


In [111]:
df[['score', 'gdp']].corr(method='spearman')

Unnamed: 0,score,gdp
score,1.0,0.366257
gdp,0.366257,1.0


In [113]:
mean2 = df.groupby('PAC').mean()

In [114]:
mean2[['s', 'gdp']].corr(method='spearman')

Unnamed: 0,score,gdp
score,1.0,0.537587
gdp,0.537587,1.0


In [115]:
mean2[['s', 'gdp']].corr(method='spearman')

Unnamed: 0,s,gdp
s,1.0,0.548823
gdp,0.548823,1.0


In [None]:
mean2[['gdp']]

In [116]:
vit = pd.read_csv('predict_score_vit.csv')

In [126]:
vit = vit[['predict', 'adcode']]
vit.columns = ['vit', 'PAC']

In [128]:
mean2

Unnamed: 0_level_0,cluster_lat,cluster_lon,nightlights,predict,score,lo,lat,population,code_x,gdp,code_y,host,code_x.1,second,code_y.1,s
PAC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
110115,39.675591,116.373047,9.440997,-0.866667,1.000000,116.373047,39.675591,69.9000,110115.0,6445617.0,110115.0,6999.0,110115.0,2432187.0,110115.0,1.000000
110116,40.678212,116.592407,0.940845,-0.968358,0.678315,116.592407,40.678212,28.4000,110116.0,2857990.0,110116.0,1711.0,110116.0,1627653.0,110116.0,0.584633
110117,40.203244,117.126243,2.361866,-0.818184,0.827348,117.126243,40.203244,40.4400,110117.0,2335531.0,110117.0,1979.0,110117.0,954094.0,110117.0,0.885964
110118,40.511727,116.990177,1.091290,-0.794126,0.851780,116.990177,40.511727,43.6700,110118.0,2782407.0,110118.0,1695.0,110118.0,1122291.0,110118.0,0.837834
110119,40.550596,116.112608,1.138501,-1.000000,0.783872,116.112608,40.550596,28.5300,110119.0,1361674.0,110119.0,1037.0,110119.0,428434.0,110119.0,0.777499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654326,47.425945,86.150008,0.595704,-0.977952,0.936382,86.150008,47.425945,3.8722,654326.0,128125.0,654326.0,145.0,654326.0,60535.0,654326.0,0.870551
659002,40.588227,81.300102,0.612419,-0.971014,0.988039,81.300102,40.588227,16.8813,659002.0,2389406.0,659002.0,2296.0,659002.0,766331.0,659002.0,0.933301
659003,39.861157,79.140015,1.278825,-0.937500,0.997374,79.140015,39.861157,16.1067,659003.0,829208.0,659003.0,838.0,659003.0,383663.0,659003.0,0.927028
659004,44.346471,87.578125,1.994666,-1.000000,0.994986,87.578125,44.346471,9.6091,659004.0,1624126.0,659004.0,855.0,659004.0,1153902.0,659004.0,0.991083


In [130]:
mean3 = mean2.merge(vit, left_on='PAC', right_on='PAC')

In [132]:
mean3[['vit', 'gdp']].corr(method='spearman')

Unnamed: 0,vit,gdp
vit,1.0,0.669352
gdp,0.669352,1.0


In [167]:
df.groupby('PAC').mean()[['25bins', 'gdp']].corr(method='spearman')

Unnamed: 0,25bins,gdp
25bins,1.0,0.532972
gdp,0.532972,1.0


In [133]:
res = pd.read_csv('predict_score_res.csv')

In [134]:
res

Unnamed: 0,PAC,predict,adcode,name,childrenNu,level,parent,subFeature,geometry,ID
0,,,110101,东城区,0,district,110000,,"POLYGON ((116.44364 39.872846, 116.443509 39.8...",0
1,110102.0,0.521103,110102,西城区,0,district,110000,,"POLYGON ((116.325809 39.896793, 116.325809 39....",1
2,110105.0,0.013687,110105,朝阳区,0,district,110000,,"MULTIPOLYGON (((116.595547 40.017517, 116.5982...",2
3,,,110106,丰台区,0,district,110000,,"POLYGON ((116.25909 39.896667, 116.260052 39.8...",3
4,,,110107,石景山区,0,district,110000,,"POLYGON ((116.16703 39.888756, 116.166362 39.8...",4
...,...,...,...,...,...,...,...,...,...,...
2843,659006.0,0.368107,659006,铁门关市,0,city,650000,,"MULTIPOLYGON (((85.741466 41.755319, 85.742772...",2843
2844,659007.0,0.649770,659007,双河市,0,city,650000,,"MULTIPOLYGON (((82.206964 44.821218, 82.178471...",2844
2845,,,659008,可克达拉市,0,city,650000,,"MULTIPOLYGON (((80.67237299999999 43.844698, 8...",2845
2846,,,659009,昆玉市,0,city,650000,,"MULTIPOLYGON (((79.573441 37.38167, 79.579897 ...",2846


In [None]:
df

In [141]:
res = res[['predict', 'adcode']]
res.columns = ['res', 'PAC']

In [142]:
mean4 = mean3.merge(res, left_on='PAC', right_on='PAC')

In [137]:
mean3[['vit', 'gdp']].corr(method='spearman')

Unnamed: 0,vit,gdp
vit,1.0,0.669352
gdp,0.669352,1.0


In [143]:
mean4[['res', 'gdp']].corr(method='spearman')

Unnamed: 0,res,gdp
res,1.0,0.655352
gdp,0.655352,1.0


In [None]:
mean3[['vit', 'gdp']].corr(method='spearman')

In [20]:
data

Unnamed: 0,name,score
0,-48.779296875_-66.93006025862447.png,0.030561
1,-44.82421875_-75.97355295343337.png,0.030572
2,-44.82421875_-70.98834922412489.png,0.030384
3,-39.111328125_-65.4034447883078.png,-0.072293
4,-50.80078125_-77.86034459764656.png,0.030548
...,...,...
131947,-38.671875_-69.28725695167886.png,0.030430
131948,-46.142578125_-72.63337363853837.png,0.030591
131949,-44.208984375_-66.08936427047087.png,0.030453
131950,-25.224609375_-74.59010800882324.png,0.030507


In [21]:
df

Unnamed: 0,name,cluster_lat,cluster_lon,nightlights,predict,score,lo,lat,PAC
0,-48.779296875_-66.93006025862447.png,43.771094,90.966797,0.382294,9.996539e-01,9.996539e-01,90.966797,43.771094,652328
1,-44.82421875_-75.97355295343337.png,40.847060,120.058594,0.477784,1.000000e+00,1.000000e+00,120.058594,40.847060,211422
2,-44.82421875_-70.98834922412489.png,40.847060,102.304688,0.273093,1.096327e-09,1.096327e-09,102.304688,40.847060,152922
3,-39.111328125_-65.4034447883078.png,36.385913,87.275391,0.385570,9.796304e-01,9.796304e-01,87.275391,36.385913,652825
4,-50.80078125_-77.86034459764656.png,45.213004,128.320312,0.341066,8.097361e-01,8.097361e-01,128.320312,45.213004,230183
...,...,...,...,...,...,...,...,...,...
131947,-38.671875_-69.28725695167886.png,36.031332,97.382812,0.283661,-1.000000e+00,2.063789e-02,97.382812,36.031332,632822
131948,-46.142578125_-72.63337363853837.png,41.836828,107.666016,0.303960,-1.000000e+00,1.516602e-01,107.666016,41.836828,150824
131949,-44.208984375_-66.08936427047087.png,40.380028,88.945312,0.315802,-1.000000e+00,5.814294e-01,88.945312,40.380028,652824
131950,-25.224609375_-74.59010800882324.png,24.447150,114.609375,0.450516,-1.000000e+00,4.475327e-05,114.609375,24.447150,441623


In [None]:
df

In [170]:
df.iloc[:,1]

0         43.771094
1         43.644026
2         43.897892
3         43.516689
4         44.024422
            ...    
123284    35.245619
123285    35.173808
123286    34.741612
123287    27.839076
123288    21.779905
Name: cluster_lat, Length: 123289, dtype: float64

In [96]:
def r2(df):
    pearson = df.corr().iloc[0,1]
    spearman = df.corr('spearman').iloc[0, 1]
    
    df = df.dropna()
    x = df.iloc[:,0].values
    y = df.iloc[:,1].values
    # Fit a simple linear regression model
    beta, _, _, _ = np.linalg.lstsq(x[:, None], y)
    y_pred = beta * x
    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)

    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)
    # Calculate the adjusted R2
    n = x.shape[0]
    p = 1
    adj_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
    
    y /= y.max()
    x /= x.max()
    
    # Calculate the residuals
    residuals = x - y
    # Calculate the mean squared error (MSE)
    mse = np.mean(residuals**2)
    # Calculate the root mean squared error (RMSE)
    rmse = np.sqrt(mse)
    # Calculate the absolute percentage error
    ape = np.abs((y - x) / y) * 100
    # Calculate the mean absolute percentage error (MAPE)
    mape = np.mean(ape)
    
    return pearson, spearman, r2, adj_r2, rmse, mape

In [201]:
r2(mean3[['vit', 'population']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.47557335718005206,
 0.5376174386421668,
 0.17397047007970357,
 0.17351485533622857,
 0.22462418959259514,
 204.84850863982916)

In [202]:
r2(mean3[['s', 'population']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.37828324088248194,
 0.44295472624393395,
 0.07178287763657332,
 0.07132964661979435,
 0.5624187117961017,
 532.2711304095155)

In [199]:
r2(mean3[['vit', 'host']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.47956800872087096,
 0.5374079711652435,
 0.20550743898753676,
 0.2050692191524499,
 0.2690739377736923,
 358.99218305000284)

In [198]:
r2(mean3[['vit', 'second']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.5441056218757786,
 0.6565444357756609,
 0.26795220745868387,
 0.2675484304081923,
 0.32896666455018864,
 2092.8019683359344)

In [191]:
r2(mean3[['s', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.13203389162360402,
 0.1316100800472484,
 0.6843306603866338,
 2562.258817193747)

In [None]:
r2(mean3[['vit', 'gdp']])

In [None]:
import numpy as np
# Define the true values
y_true = np.array([1, 2, 3, 4, 5])
# Define the predicted values
y_pred = np.array([0.8, 1.9, 2.7, 4.1, 4.8])
# Calculate the absolute percentage error
ape = np.abs((y_true - y_pred) / y_true) * 100
# Calculate the mean absolute percentage error (MAPE)
mape = np.mean(ape)
print('MAPE: ', mape)

In [None]:
res

In [99]:
res = pd.read_csv('predict_score_res.csv')
# res = res[['name', 'predict']]

In [100]:
res

Unnamed: 0,PAC,predict,adcode,name,childrenNu,level,parent,subFeature,geometry,ID
0,,,110101,东城区,0,district,110000,,"POLYGON ((116.44364 39.872846, 116.443509 39.8...",0
1,110102.0,0.521103,110102,西城区,0,district,110000,,"POLYGON ((116.325809 39.896793, 116.325809 39....",1
2,110105.0,0.013687,110105,朝阳区,0,district,110000,,"MULTIPOLYGON (((116.595547 40.017517, 116.5982...",2
3,,,110106,丰台区,0,district,110000,,"POLYGON ((116.25909 39.896667, 116.260052 39.8...",3
4,,,110107,石景山区,0,district,110000,,"POLYGON ((116.16703 39.888756, 116.166362 39.8...",4
...,...,...,...,...,...,...,...,...,...,...
2843,659006.0,0.368107,659006,铁门关市,0,city,650000,,"MULTIPOLYGON (((85.741466 41.755319, 85.742772...",2843
2844,659007.0,0.649770,659007,双河市,0,city,650000,,"MULTIPOLYGON (((82.206964 44.821218, 82.178471...",2844
2845,,,659008,可克达拉市,0,city,650000,,"MULTIPOLYGON (((80.67237299999999 43.844698, 8...",2845
2846,,,659009,昆玉市,0,city,650000,,"MULTIPOLYGON (((79.573441 37.38167, 79.579897 ...",2846


In [94]:
df

Unnamed: 0.1,Unnamed: 0,name,nightlights,score,PAC,population,gdp,host,second,newloss
0,0,-48.779296875_-66.93006025862447.png,0.382294,0.999654,652328,8.8985,282062.00,430.0,55819.00,0.030561
1,1,-48.603515625_-66.68778386116202.png,0.332832,0.997266,652328,8.8985,282062.00,430.0,55819.00,0.030480
2,2,-48.955078125_-66.99884379185184.png,0.316878,0.998954,652328,8.8985,282062.00,430.0,55819.00,0.030526
3,3,-48.427734375_-66.96447630005638.png,0.303142,0.491546,652328,8.8985,282062.00,430.0,55819.00,0.030506
4,4,-49.130859375_-66.82652027497478.png,0.353151,0.999952,652328,8.8985,282062.00,430.0,55819.00,0.030552
...,...,...,...,...,...,...,...,...,...,...
123284,123284,-37.705078125_-74.16408546675687.png,0.853912,1.000000,410822,40.5300,2605347.00,1846.0,1657451.00,0.030643
123285,123285,-37.6171875_-74.16408546675687.png,3.321868,1.000000,410822,40.5300,2605347.00,1846.0,1657451.00,0.030629
123286,123286,-37.08984375_-73.84928645675248.png,3.990151,1.000000,411281,16.7900,1369734.00,1230.0,944290.00,0.030661
123287,123287,-29.00390625_-74.16408546675687.png,8.351891,1.000000,430304,47.5200,5784480.00,3560.0,3300668.00,0.030583


In [97]:
df.merge(res, left_on='name', right_on='name')

Unnamed: 0.1,Unnamed: 0,name,nightlights,score,PAC,population,gdp,host,second,newloss,predict


In [89]:
df = df.merge(res)

In [92]:
df = pd.read_csv('df.csv')

In [90]:
df.to_csv('df.csv', index=False)

In [None]:
# pearson, spearman, r2, adj_r2, rmse, mape

In [98]:
r2(df.groupby('PAC').mean()[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.3888542029536748,
 0.5641770678476032,
 0.1496376978910472,
 0.14922248192322052,
 0.34134934496177316,
 1247.1349358207356)

In [101]:
r2(df.groupby('PAC').mean()[['nl', 'nightlights']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



(0.0432946646729968,
 0.18298412755035168,
 0.00010689623011961302,
 -0.00038133282445551586,
 0.9288629880983515,
 7406.709789747735)

In [193]:
def r2(df):
    pearson = df.corr().iloc[0,1]
    spearman = df.corr('spearman').iloc[0, 1]
    kendall = df.corr('kendall').iloc[0, 1]
    df = df.dropna()
    x = df.iloc[:,0].values
    y = df.iloc[:,1].values
    # Fit a simple linear regression model
    beta, _, _, _ = np.linalg.lstsq(x[:, None], y)
    y_pred = beta * x
    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)

    # Calculate the residual sum of squares (RSS)
    rss = np.sum((y - y_pred)**2)
    # Calculate the total sum of squares (TSS)
    mean_y = np.mean(y)
    tss = np.sum((y - mean_y)**2)
    # Calculate R2
    r2 = 1 - (rss / tss)
    # Calculate the adjusted R2
    n = x.shape[0]
    p = 1
    adj_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
    
    y /= y.max()
    x /= x.max()
    
    # Calculate the residuals
    residuals = x - y
    # Calculate the mean squared error (MSE)
    mse = np.mean(residuals**2)
    # Calculate the root mean squared error (RMSE)
    rmse = np.sqrt(mse)
    # Calculate the absolute percentage error
    ape = np.abs((y - x) / y) * 100
    # Calculate the mean absolute percentage error (MAPE)
    mape = np.mean(ape)
    
    return {'pearson':pearson, 'spearman':spearman, 'kendall':kendall, 'r2':r2, 'adj_r2':adj_r2, 'rmse':rmse, 'mape':mape}

In [198]:
df.columns

Index(['name', 'nightlights', 'score', 'PAC', 'population', 'gdp', 'host',
       'second', 'newloss', 'rank25', 'rank10', 'rank10-', 'rank10-drop',
       'nl-res', 'res-rankloss-10bins-100', 'nl', 'res-m', 'pca', 'ae',
       'nl-50', 'res50+', 'res50++', 'res2loss', 'res1loss'],
      dtype='object')

In [207]:
items = ['gdp', 'second', 'population', 'host', 'nightlights', 'pca', 'ae','nl-50', 'res-m']
host = pd.DataFrame([r2(df[[item, 'gdp']]) for item in items])
host.index = items


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machin

In [208]:
items = ['gdp', 'second', 'population', 'host', 'nightlights', 'pca', 'ae','nl-50', 'res-m']
host = pd.DataFrame([r2(df[[item,  'host']]) for item in items])
host.index = items


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machin

In [233]:
first = pd.read_csv('第一产业增加值.csv')
first['code'] = first['code'].apply(int)
first = first.rename({'value':'first'}, axis=1)

In [234]:
df = df.merge(first, left_on='PAC', right_on='code')

In [22]:
items = ['gdp', 'first', 'second', 'population', 'host', 'nightlights', 'pca', 'ae','nl-50', 'res-m']

In [23]:
log_gdp = pd.DataFrame([r2(df[[item,  'gdp']]) for item in items])
log_gdp.index = items

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


In [241]:
gdp = pd.DataFrame([r2(df[[item,  'gdp']]) for item in items])
gdp.index = items

first = pd.DataFrame([r2(df[[item,  'first']]) for item in items])
first.index = items

second = pd.DataFrame([r2(df[[item,  'second']]) for item in items])
second.index = items

population = pd.DataFrame([r2(df[[item,  'population']]) for item in items])
population.index = items

host = pd.DataFrame([r2(df[[item, 'host']]) for item in items])
host.index = items


# = pd.DataFrame([r2(df[[item, 'gdp']]) for item in items])
# host.index = items


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machin


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machin

In [24]:
log_gdp[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,1.0,1.0,1.0
first,0.541045,0.818233,0.638678
second,0.957478,0.939339,0.799389
population,0.629154,0.827765,0.639556
host,0.696495,0.84417,0.65242
nightlights,0.311088,0.392843,0.273156
pca,0.185212,0.382493,0.261243
ae,0.0693,0.076812,0.051964
nl-50,0.46143,0.502993,0.355045
res-m,0.362387,0.540359,0.379165


In [242]:
gdp[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,1.0,1.0,1.0
first,0.541045,0.818233,0.638678
second,0.957478,0.939339,0.799389
population,0.629154,0.827765,0.639556
host,0.696495,0.84417,0.65242
nightlights,0.311088,0.392843,0.273156
pca,0.185212,0.382493,0.261243
ae,0.0693,0.076812,0.051964
nl-50,0.46143,0.502993,0.355045
res-m,0.362387,0.540359,0.379165


In [238]:
first[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.541045,0.818233,0.638678
first,1.0,1.0,1.0
second,0.395661,0.663937,0.480522
population,0.742463,0.831202,0.644218
host,0.660039,0.796488,0.606152
nightlights,0.103624,0.414653,0.281641
pca,0.312283,0.416577,0.282977
ae,0.051884,0.0636,0.042889
nl-50,0.219285,0.49434,0.342493
res-m,0.430778,0.529399,0.368152


In [239]:
second[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.957478,0.939339,0.799389
first,0.395661,0.663937,0.480522
second,1.0,1.0,1.0
population,0.51542,0.696805,0.513036
host,0.589186,0.719106,0.529224
nightlights,0.283866,0.324825,0.22325
pca,0.122175,0.30594,0.20837
ae,0.064118,0.080957,0.054343
nl-50,0.421627,0.451617,0.315527
res-m,0.305508,0.475799,0.329088


In [240]:
population[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.629154,0.827765,0.639556
first,0.742463,0.831202,0.644218
second,0.51542,0.696805,0.513036
population,1.0,1.0,1.0
host,0.88282,0.943342,0.793416
nightlights,0.179187,0.380516,0.26377
pca,0.322051,0.463555,0.315051
ae,0.058155,0.077375,0.051834
nl-50,0.302175,0.524664,0.365268
res-m,0.437916,0.516072,0.357345


In [244]:
host[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.696495,0.84417,0.65242
first,0.660039,0.796488,0.606152
second,0.589186,0.719106,0.529224
population,0.88282,0.943342,0.793416
host,1.0,1.0,1.0
nightlights,0.21884,0.371398,0.259354
pca,0.291226,0.428463,0.290497
ae,0.051619,0.069411,0.046735
nl-50,0.326953,0.497071,0.346035
res-m,0.407833,0.500987,0.347509


In [210]:
items = ['gdp', 'second', 'population', 'host', 'nightlights', 'pca', 'ae','nl-50', 'res-m']
second = pd.DataFrame([r2(df[[item,  'second']]) for item in items])
second.index = items


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


`rcond` parameter will change to the default of machin

In [214]:
!find *.xlsx

economicdata.xlsx


In [None]:
index

In [223]:
second[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.960166,0.938796,0.798801
second,1.0,1.0,1.0
population,0.511029,0.694758,0.511187
host,0.581797,0.717058,0.52732
nightlights,0.267879,0.322925,0.221736
pca,0.121826,0.304954,0.207768
ae,0.064734,0.081764,0.054876
nl-50,0.415607,0.450504,0.314594
res-m,0.304285,0.476183,0.329266


In [221]:
population[['pearson', 'spearman', 'kendall']]

Unnamed: 0,pearson,spearman,kendall
gdp,0.625772,0.82658,0.638187
second,0.511029,0.694758,0.511187
population,1.0,1.0,1.0
host,0.88371,0.943129,0.793129
nightlights,0.167387,0.37921,0.262688
pca,0.321988,0.463117,0.314845
ae,0.058756,0.078273,0.052427
nl-50,0.297446,0.524153,0.364791
res-m,0.438049,0.516768,0.357788


In [145]:
r2(df.groupby('PAC').mean()[['nightlights', 'nightlights']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 1.0,
 'spearman': 1.0,
 'kendall': 1.0,
 'r2': 1.0,
 'adj_r2': 1.0,
 'rmse': 0.0,
 'mape': 0.0}

In [146]:
r2(df.groupby('PAC').mean()[['ae', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.28175278202395776,
 'spearman': 0.2815419009816775,
 'kendall': 0.18921319564327388,
 'r2': 0.007547663172684516,
 'adj_r2': 0.007063067305093096,
 'rmse': 3565509.7867204314,
 'mape': 99.99991754198003}

In [149]:
r2(df.groupby('PAC').mean()[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3888542029536748,
 'spearman': 0.5641770678476032,
 'kendall': 0.4087936355346081,
 'r2': 0.1496376978910472,
 'adj_r2': 0.14922248192322052,
 'rmse': 3565509.876883327,
 'mape': 99.99994915351292}

In [186]:
r2(df.groupby('PAC').mean()[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.6803982627029886,
 'spearman': 0.7163714886329444,
 'kendall': 0.5294198450785279,
 'r2': 0.4285096510842009,
 'adj_r2': 0.42823060306226934,
 'rmse': 3565508.929885141,
 'mape': 99.99991142286936}

In [163]:
r2(df[['res50+', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.12275113715614638,
 'spearman': 0.0948687344322813,
 'kendall': 0.064759102599424,
 'r2': -0.005928036396859682,
 'adj_r2': -0.00593619563535519,
 'rmse': 2016514.758796866,
 'mape': 99.99988116777232}

In [160]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.4590602476146543,
 'spearman': 0.5025867337931158,
 'kendall': 0.35463330935843584,
 'r2': 0.11332372434639448,
 'adj_r2': 0.11331653237744677,
 'rmse': 2016514.3462801117,
 'mape': 99.99985782607938}

In [162]:
r2(df[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3636008242305105,
 'spearman': 0.5415252512162172,
 'kendall': 0.3799445659186035,
 'r2': 0.09782135267017833,
 'adj_r2': 0.09781403495908692,
 'rmse': 2016514.6333417895,
 'mape': 99.99994318452254}

In [108]:
r2(df.groupby('PAC').mean()[['nl', 'nightlights']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.0432946646729968,
 'spearman': 0.18298412755035168,
 'r2': 0.00010689623011961302,
 'adj_r2': -0.00038133282445551586,
 'rmse': 2.14740165120097,
 'mape': 50.5570938885364}

In [185]:
r2(df.groupby('PAC').mean()[['res1loss', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.5875092570819308,
 'spearman': 0.4356162645270833,
 'kendall': 0.30585506732981693,
 'r2': 0.23928135746999057,
 'adj_r2': 0.2389099128203177,
 'rmse': 3565509.607092224,
 'mape': 99.99993665256915}

In [190]:
r2(df[['res-m', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.3636008242305105,
 'spearman': 0.5415252512162172,
 'kendall': 0.3799445659186035,
 'r2': 0.09782135267017833,
 'adj_r2': 0.09781403495908692,
 'rmse': 2016514.6333417895,
 'mape': 99.99994318452254}

In [191]:
r2(df[['nl-50', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.4590602476146543,
 'spearman': 0.5025867337931158,
 'kendall': 0.35463330935843584,
 'r2': 0.11332372434639448,
 'adj_r2': 0.11331653237744677,
 'rmse': 2016514.3462801117,
 'mape': 99.99985782607938}

In [192]:
r2(df[['pca', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.18551807249781757,
 'spearman': 0.38159824666846737,
 'kendall': 0.2607310389980199,
 'r2': 0.0343445980223519,
 'adj_r2': 0.03433676544144726,
 'rmse': 2016514.5643257548,
 'mape': 99.99976581678001}

In [187]:
r2(df[['res1loss', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.25237223549505056,
 'spearman': 0.0882469545282074,
 'kendall': 0.059507585730569086,
 'r2': -0.10052473423852248,
 'adj_r2': -0.1005336607655225,
 'rmse': 2016514.6117314184,
 'mape': 99.99985443640423}

In [173]:
r2(df.groupby('PAC').mean()[['res50++', 'gdp']])


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



{'pearson': 0.39759805195992665,
 'spearman': 0.12067813391869274,
 'kendall': 0.08272738591871022,
 'r2': 0.13480339190113144,
 'adj_r2': 0.1343809326198332,
 'rmse': 3565510.0197179867,
 'mape': 99.99993834283482}

In [153]:
x = timm.Resnet()

AttributeError: module 'timm' has no attribute 'Resnet'

In [29]:
df['log_first'] = df['first'].map(np.log)
df['log_second'] = df.second.map(np.log)
df['log_population'] = df.population.map(np.log)
df['log_host'] = df.host.map(np.log)

In [48]:
log_gdp = pd.DataFrame([r2(df[[item,  f'log_{it}']]) for item in items])
log_gdp.index = items
log_gdp = log_gdp.reset_index()
log_gdp = log_gdp[['index', 'pearson', 'spearman']]
log_gdp.columns = ['index', f'{it}_pearson', f'{it}_spearman']

In [75]:
r2(df[['resb15','log_gdp']])

  if __name__ == '__main__':


{'pearson': 0.41857358983562964,
 'spearman': 0.43996682247400215,
 'kendall': 0.3001788238671966,
 'r2': -81.85255445363859,
 'adj_r2': -81.85322448359656,
 'rmse': 0.6887719227431028,
 'mape': 87.8714041972913}

In [69]:
items = ['res-m', 'vit', 'eff', 'vgg']

In [72]:
res

Unnamed: 0,index,gdp_pearson,gdp_spearman,first_pearson,first_spearman,second_pearson,second_spearman,population_pearson,population_spearman,host_pearson,host_spearman
0,res-m,0.544141,0.540359,0.514694,0.529399,0.485231,0.475799,0.503338,0.516072,0.499468,0.500987
1,vit,0.590918,0.549522,0.515851,0.515488,0.545173,0.510706,0.499958,0.477045,0.502207,0.472732
2,eff,0.252305,0.323762,0.211602,0.284612,0.239804,0.302594,0.229541,0.302505,0.228481,0.288736
3,vgg,0.385825,0.397163,0.391049,0.4038,0.336382,0.333956,0.471334,0.472933,0.424388,0.425199


In [71]:
for it in ['gdp', 'first', 'second', 'population', 'host']:
    log_gdp = pd.DataFrame([r2(df[[item,  f'log_{it}']]) for item in items])
    log_gdp.index = items
    log_gdp = log_gdp.reset_index()
    log_gdp = log_gdp[['index', 'pearson', 'spearman']]
    log_gdp.columns = ['index', f'{it}_pearson', f'{it}_spearman']
    if it != 'gdp':
        res = res.merge(log_gdp)
    else:
        res = log_gdp

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


In [49]:
for it in ['first', 'second', 'population', 'host']:
    log_gdp = pd.DataFrame([r2(df[[item,  f'log_{it}']]) for item in items])
    log_gdp.index = items
    log_gdp = log_gdp.reset_index()
    log_gdp = log_gdp[['index', 'pearson']]
    log_gdp.columns = ['index', it]
    res = res.merge(log_gdp)
# = pd.DataFrame([r2(df[[item, 'gdp']]) for item in items])
# host.index = items

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


In [50]:
res

Unnamed: 0,index,gdp,first,second,population,host
0,gdp,0.715014,0.467647,0.691549,0.542645,0.573387
1,first,0.679708,0.821352,0.550101,0.699872,0.663789
2,second,0.646074,0.357086,0.670269,0.436029,0.47394
3,population,0.693514,0.664699,0.61271,0.83326,0.785243
4,host,0.707086,0.614115,0.629403,0.771466,0.829593
5,nightlights,0.189148,0.094344,0.178024,0.151712,0.161095
6,pca,0.311397,0.339096,0.251604,0.388912,0.363358
7,ae,0.071693,0.056554,0.076407,0.069084,0.063187
8,nl-50,0.302331,0.204368,0.284477,0.262638,0.265018
9,res-m,0.544141,0.514694,0.485231,0.503338,0.499468


In [37]:
pd.merge(log_gdp.reset_index(), log_host.reset_index())

Unnamed: 0,index,pearson,spearman,kendall,r2,adj_r2,rmse,mape


In [31]:
log_host[['pearson']]

Unnamed: 0,pearson
gdp,0.573387
first,0.663789
second,0.47394
population,0.785243
host,0.829593
nightlights,0.161095
pca,0.363358
ae,0.063187
nl-50,0.265018
res-m,0.499468


In [33]:
log_population[['pearson']]

Unnamed: 0,pearson
gdp,0.542645
first,0.699872
second,0.436029
population,0.83326
host,0.771466
nightlights,0.151712
pca,0.388912
ae,0.069084
nl-50,0.262638
res-m,0.503338


In [None]:
log_host[['pearson']]

In [51]:
part = 'host'

res = df.groupby('PAC').mean().merge(province, left_on='code_y', right_on='code').groupby('region')[['res-n', part]].corr()
print('data1=', res[res!=1][[part]].reset_index(level=1).dropna().to_dict()[part])

print('y1=', df.groupby('PAC').mean()[['res-n', part]].corr().iloc[0,1])

res = df.groupby('PAC').mean().merge(province, left_on='code_y', right_on='code').groupby('region')[['res-n', part]].corr('spearman')
print('data2=', res[res!=1][[part]].reset_index(level=1).dropna().to_dict()[part])

print('y2=', df.groupby('PAC').mean()[['res-n', part]].corr('spearman').iloc[0,1])

NameError: name 'province' is not defined