In [27]:
import os
import pandas as pd
import numpy as np
from utils.myutils import load_object, save_object, long_name, short_name
from sklearn.linear_model import LinearRegression
from scipy import stats
from math import sqrt

In [2]:
# Fetch the three unit lookup tables by name through load_object, in the same
# order as before: conversion table, unit table, reverse unit table.
unit_conversion_dict, unit_dict, unit_reverse_dict = (
    load_object(object_name)
    for object_name in ('unit_conversion_dict', 'unit_dict', 'unit_reverse_dict')
)

In [331]:
# Unit-detection pipeline.
#
# For every CSV in the data directory whose property has a known target unit,
# each frequent token following a number ('InfoTokenAfter') is tested as a
# possible unit: a no-intercept linear regression of 'InfoFactNumber' on
# 'InfoAbstractNumber' is fitted, refined by improve_regression, and - if the
# final R^2 is high enough and the factor is not ~1 - stored in
# unit_conversion_dict as (factor, R^2) together with the inverse direction.

# --- Configuration -----------------------------------------------------------
# os.listdir() on a bytes path yields bytes entries; they are decoded again
# with os.fsdecode() inside the loop.
directory = os.fsencode("data")
directory_name = directory.decode("utf-8")

# Minimum number of occurrences of one token to treat it as a possible unit
min_count = 100
# Only for manual units: minimum number of occurrences to record a token as a
# candidate for manual evaluation (only has an effect when < min_count)
min_count_manual = 100

# Minimum R^2 of the first regression model to treat a token as a possible unit
min_r2 = 0.5

# Minimum R^2 of the final regression model to treat a token as an actual unit
min_final_r2 = 0.85

# Minimum number of instances to keep (read as a global by improve_regression)
min_instances = 15

# Write new findings to unit_conversion_dict?
insert_ground_truth = True
# Overwrite already existing findings in unit_conversion_dict when the new R^2 is higher?
overwrite_ground_truth = True
# Always overwrite already existing findings in unit_conversion_dict? -> Set to False usually!
overwrite_ground_truth_hard = False


# --- Main loop ---------------------------------------------------------------
for file in os.listdir(directory):

    file_name = os.fsdecode(file)
    if not file_name.endswith(".csv"):
        continue
    property_name = file_name[:-4]

    # Properties without an entry (or with a None entry) in unit_dict are skipped.
    try:
        to_unit = unit_dict['<' + long_name(property_name) + '>']
        if to_unit is None:
            continue
    except KeyError:
        continue

    print('--- ' + file_name + '---')

    try:
        dtypes = load_object('data_info/' + property_name + '_dtypes')
    except FileNotFoundError:
        print('WARNING! No dtype information available for ' + file_name + '!')
        dtypes = None

    data = pd.read_csv(os.path.join(directory_name, file_name),
                       encoding="utf-8",
                       dtype=dtypes,
                       sep=',')

    lines = data.shape[0]

    # Non-null 'IntegersInAbstract' count per token, most frequent first.
    # The descending order is what makes the `break` below valid.
    group = (
        data.groupby('InfoTokenAfter')
        .count()
        .sort_values('IntegersInAbstract', ascending=False)['IntegersInAbstract']
    )

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent
    # spelling that also exists in older versions.
    for token, count in group.items():

        if token == to_unit:
            continue

        key = (token, to_unit, False)

        # A stored False marks a token that was declared to be no unit.
        # (`== False` is kept on purpose: tuples and 'undefined' compare unequal.)
        if unit_conversion_dict.get(key) == False:  # No meaningful formula
            print(token + ' has declared to be no unit and will be excluded.')
            continue

        if count >= min_count_manual and count < min_count:
            print(token + " is candidate")
            # Only add the placeholder for unknown tokens; never replace a
            # conversion that was already stored (e.g. found in another file).
            if key not in unit_conversion_dict:
                unit_conversion_dict[key] = 'undefined'

        # Tokens are sorted by count, so every following token is too rare as well.
        if count < min_count:
            break

        data_curr = data[data['InfoTokenAfter'] == token]
        x_column = data_curr['InfoAbstractNumber'].values.reshape(-1, 1)

        # First, unrefined fit through the origin.
        reg = LinearRegression(fit_intercept=False)
        reg.fit(x_column, data_curr['InfoFactNumber'])
        score = reg.score(x_column, data_curr['InfoFactNumber'])

        # Written as `not >=` so that a NaN score is skipped, as before.
        if not (score >= min_r2):
            continue

        x_values = pd.to_numeric(data_curr['InfoAbstractNumber']).values
        y_values_true = pd.to_numeric(data_curr['InfoFactNumber']).values
        # Predictions of the *initial* fit over all points (taken before refinement).
        y_values_predicted = reg.predict(x_column)

        # Iteratively drop outliers and refit.
        reg, score = improve_regression(reg, score, x_values, y_values_true)

        if score < min_final_r2:
            print('Discarded ' + token)
            continue

        factor = reg.coef_[0]
        if 0.95 <= factor <= 1.05:
            print('Discarded equivalent unit: ' + token)
            continue

        print(token + ' -> ' + to_unit)
        print(count)
        print(score)
        print(reg.coef_)

        # Diagnostics of the initial fit. They do not influence what is stored,
        # but the exploratory cells further down read y_values_predicted and
        # st_deviation, so they stay module-level names.
        mean = np.mean(x_values)
        ssq = ((x_values - mean) ** 2).sum()
        residual_ssq = ((y_values_true - y_values_predicted) ** 2).sum()
        st_deviation = sqrt(residual_ssq / (count - 2))
        t_value = stats.t.ppf(0.95, count)

        print('---')

        # Decide whether to store the finding. Only a stored (factor, score)
        # pair counts as an existing finding; a missing key or the 'undefined'
        # placeholder does not. This avoids the KeyError / TypeError that
        # indexing [1] on a missing or placeholder entry would raise.
        existing = unit_conversion_dict.get(key)
        has_finding = isinstance(existing, (tuple, list))
        if (
            overwrite_ground_truth_hard
            or (insert_ground_truth and not has_finding)
            or (overwrite_ground_truth and has_finding and score > existing[1])
        ):
            unit_conversion_dict[key] = (factor, score)
            unit_conversion_dict[(to_unit, token, False)] = (1 / factor, score)

if insert_ground_truth or overwrite_ground_truth or overwrite_ground_truth_hard:
    save_object(unit_conversion_dict, 'unit_conversion_dict')
print("Finished")

--- acceleration.csv---
--- apoapsis.csv---
Abort: Score decreased
Discarded years
--- approximateCalories.csv---
--- area.csv---
Abort: Factor did not change any more
km² -> m2
137
1.0
[1000000.]
---
Abort: Too less instances
km2 -> m2
107
0.9999999999933139
[999999.99856751]
---
Abort: Score decreased
ha -> m2
101
0.9998398975344222
[9989.34047494]
---
square has declared to be no unit and will be excluded.
--- areaLand.csv---
Abort: Score decreased
Discarded by
Abort: Perfect fit
km2 -> m2
470
1.0
[1000000.]
---
square has declared to be no unit and will be excluded.
Abort: Perfect fit
km² -> m2
189
1.0
[1000000.]
---
--- areaMetro.csv---
--- areaOfCatchment.csv---
Abort: Score decreased
km2 -> m2
113
0.9531199342294102
[7381943.97668456]
---
square has declared to be no unit and will be excluded.
--- areaRural.csv---
--- areaTotal.csv---
square has declared to be no unit and will be excluded.
Abort: Perfect fit
km2 -> m2
4304
1.0
[1000000.]
---
Abort: Factor did not change any more

Abort: Score decreased
Discarded tests
--- wheelbase.csv---
--- width.csv---
Abort: Score decreased
km2 -> μ
272
0.8703676913573761
[0.64218346]
---
Abort: Score decreased
Discarded square
Abort: Score decreased
Discarded mm
Abort: Score decreased
Discarded hectares
Finished


In [330]:
# Hand the current in-memory unit_conversion_dict to save_object under the name
# 'unit_conversion_dict' (presumably this persists it - see utils.myutils).
save_object(unit_conversion_dict, 'unit_conversion_dict')

In [15]:
def refresh_unit_conversion_dict():
    """Pass a brand-new empty dict to save_object under 'unit_conversion_dict'.

    Presumably this replaces the stored conversion table with an empty one;
    the in-memory ``unit_conversion_dict`` variable is not touched.
    """
    save_object({}, 'unit_conversion_dict')
#refresh_unit_conversion_dict()

In [323]:
def improve_regression(reg, score, x_values, y_values_true):
    """Repeatedly drop outliers and refit until a stop condition is met.

    Each round computes the squared residuals of the current model, keeps the
    points accepted by ``outlier_filter`` and fits a fresh no-intercept
    ``LinearRegression`` on them. The round's result is rejected - and the
    current model returned - as soon as one of these holds (checked in order):

    * fewer than ``min_instances`` (module-level setting) points were kept,
    * the new R^2 is at least 0.05 below the current one,
    * the coefficient did not change,
    * the current score is already exactly 1.0.

    Parameters
    ----------
    reg : fitted model to start from (must offer ``predict`` and ``coef_``).
    score : R^2 belonging to ``reg``.
    x_values, y_values_true : 1-D arrays of inputs and observed targets.

    Returns
    -------
    tuple
        ``(model, score)`` of the last accepted round.
    """
    while True:
        squared_errors = (y_values_true - reg.predict(x_values.reshape(-1, 1))) ** 2
        keep = outlier_filter(squared_errors)
        x_kept, y_kept = x_values[keep], y_values_true[keep]

        candidate = LinearRegression(fit_intercept=False)
        candidate.fit(x_kept.reshape(-1, 1), y_kept)
        candidate_score = candidate.score(x_kept.reshape(-1, 1), y_kept)

        if len(x_kept) < min_instances:
            stop_reason = 'Abort: Too less instances'
        elif candidate_score + 0.05 <= score:
            stop_reason = 'Abort: Score decreased'
        elif reg.coef_ == candidate.coef_:
            stop_reason = 'Abort: Factor did not change any more'
        elif score == 1.0:
            stop_reason = 'Abort: Perfect fit'
        else:
            # Round accepted: continue from the refined model and reduced data.
            reg, score = candidate, candidate_score
            x_values, y_values_true = x_kept, y_kept
            continue

        print(stop_reason)
        return (reg, score)

In [126]:
def outlier_filter(input_array):
    """Return a boolean mask marking the values that are not outliers.

    A value is kept when it lies within 1.5 interquartile ranges of the median
    (both bounds inclusive).
    """
    center = np.median(input_array)
    reach = 1.5 * stats.iqr(input_array)
    lower_bound = center - reach
    upper_bound = center + reach

    above_lower = input_array >= lower_bound
    below_upper = input_array <= upper_bound
    return above_lower & below_upper

In [248]:
# Squared residuals between observed and predicted values. Both arrays are
# leftover kernel state (presumably from the last token the pipeline cell processed).
residuals = (y_values_true - y_values_predicted) ** 2

In [249]:
outlier_filter(residuals)

array([ True, False, False, False,  True,  True,  True,  True, False,
        True, False, False,  True, False,  True,  True,  True,  True,
       False, False,  True,  True, False,  True,  True, False,  True,
        True,  True, False, False, False, False, False, False,  True,
        True,  True, False, False, False,  True,  True, False, False,
        True, False,  True,  True,  True,  True, False, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True, False, False,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,

In [250]:
# Round-1 keep-mask: True where the squared residual is accepted by outlier_filter.
rmo = outlier_filter(residuals)

In [252]:
# Round-1 refit on the points kept by `rmo`; the last line displays its R^2.
# NOTE(review): LinearRegression() is created without fit_intercept=False, so an
# intercept is fitted here, unlike in the pipeline cell and improve_regression.
reg = LinearRegression()
reg.fit(x_values[rmo].reshape(-1, 1), y_values_true[rmo])
reg.score(x_values[rmo].reshape(-1, 1), y_values_true[rmo])

0.9997625102500736

In [253]:
reg.coef_

array([1001361.48218921])

In [254]:
reg.intercept_

-70467914.77292633

In [255]:
# Squared residuals of the current `reg`, evaluated only on the points kept by `rmo`.
# Rebinds `residuals`, which now has one entry per kept point.
y_values_predicted_rmo = reg.predict(x_values[rmo].reshape(-1, 1))
residuals = (y_values_true[rmo] - y_values_predicted_rmo) ** 2

In [256]:
outlier_filter(residuals)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True])

In [257]:
# Round-2 keep-mask over the current `residuals`; meant to index x_values[rmo].
rmob = outlier_filter(residuals)

In [258]:
# Round-2 refit on the points surviving `rmo` and then `rmob`; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob].reshape(-1, 1), y_values_true[rmo][rmob])
reg.score(x_values[rmo][rmob].reshape(-1, 1), y_values_true[rmo][rmob])

0.9999984503069839

In [259]:
reg.coef_

array([1000278.31790677])

In [260]:
reg.intercept_

-12959909.116361618

In [261]:
# Squared residuals of the current `reg` on the points surviving `rmo` and `rmob`.
# Rebinds `residuals`.
y_values_predicted_rmob = reg.predict(x_values[rmo][rmob].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob] - y_values_predicted_rmob) ** 2

In [262]:
outlier_filter(residuals)

array([ True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True, False,  True, False, False, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True])

In [263]:
# Round-3 keep-mask over the current `residuals`; meant to index x_values[rmo][rmob].
rmoc = outlier_filter(residuals)

In [265]:
# Round-3 refit on the points surviving `rmo`, `rmob` and `rmoc`; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob][rmoc].reshape(-1, 1), y_values_true[rmo][rmob][rmoc])
reg.score(x_values[rmo][rmob][rmoc].reshape(-1, 1), y_values_true[rmo][rmob][rmoc])

0.999999950126452

In [266]:
reg.coef_

array([1000049.2451018])

In [267]:
reg.intercept_

-2407806.4505233765

In [268]:
# Squared residuals of the current `reg` on the points surviving masks rmo..rmoc.
# Rebinds `residuals`.
y_values_predicted_rmoc = reg.predict(x_values[rmo][rmob][rmoc].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob][rmoc] - y_values_predicted_rmoc) ** 2

In [269]:
outlier_filter(residuals)

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [270]:
# Round-4 keep-mask over the current `residuals`; meant to index x_values[rmo][rmob][rmoc].
rmod = outlier_filter(residuals)

In [271]:
# Round-4 refit on the points surviving masks rmo..rmod; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob][rmoc][rmod].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod])
reg.score(x_values[rmo][rmob][rmoc][rmod].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod])

0.9999999985557569

In [272]:
reg.coef_

array([1000008.67338711])

In [273]:
reg.intercept_

-405039.3541069031

In [274]:
# Squared residuals of the current `reg` on the points surviving masks rmo..rmod.
# Rebinds `residuals`.
y_values_predicted_rmod = reg.predict(x_values[rmo][rmob][rmoc][rmod].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob][rmoc][rmod] - y_values_predicted_rmod) ** 2

In [275]:
outlier_filter(residuals)

array([ True, False,  True,  True,  True, False,  True, False,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [276]:
# Round-5 keep-mask over the current `residuals`; meant to index x_values[rmo][rmob][rmoc][rmod].
rmoe = outlier_filter(residuals)

In [277]:
# Round-5 refit on the points surviving masks rmo..rmoe; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob][rmoc][rmod][rmoe].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe])
reg.score(x_values[rmo][rmob][rmoc][rmod][rmoe].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe])

0.9999999999993812

In [278]:
reg.coef_

array([1000000.15342646])

In [279]:
reg.intercept_

-7168.901363372803

In [280]:
# Squared residuals of the current `reg` on the points surviving masks rmo..rmoe.
# Rebinds `residuals`.
y_values_predicted_rmoe = reg.predict(x_values[rmo][rmob][rmoc][rmod][rmoe].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob][rmoc][rmod][rmoe] - y_values_predicted_rmoe) ** 2

In [281]:
residuals

array([6.86124507e+09, 3.94786842e+09, 5.13730426e+07, 5.13778593e+07,
       5.30533219e+09, 4.29705885e+07, 3.56632091e+07, 5.11661588e+07,
       4.95732246e+07, 4.01189806e+07, 4.44835510e+04, 2.41956005e+07,
       4.96894626e+07, 5.13116985e+07, 9.06301268e+06, 1.21747596e+06,
       2.59568980e+05, 3.81187284e+06, 5.13678082e+07, 5.16053078e+07,
       7.32238754e+05, 8.68609962e+06, 3.11074881e+07, 4.76537110e+07,
       2.97617676e+07, 5.14173655e+06, 2.09556583e+07, 5.12966430e+07,
       1.79996424e+07, 3.71480030e+07, 1.49651009e+06, 7.28396093e+05,
       1.03783552e+07, 2.88943454e+07, 3.13303736e+07, 3.75756421e+07,
       3.02880408e+07, 2.80153840e+07, 5.13676323e+07, 1.54587095e+07,
       4.54182627e+07, 4.71930497e+07, 3.26440593e+07, 1.96746274e+07,
       1.06279619e+07, 3.89938671e+07, 3.10561659e+07, 3.55679835e+07,
       3.30379580e+07, 3.21817163e+07, 2.59859763e+07, 4.75372781e+07,
       3.17491317e+07, 4.84663201e+07, 3.47926597e+07])

In [282]:
# Round-6 keep-mask over the current `residuals`; meant to index the rmo..rmoe-filtered arrays.
rmof = outlier_filter(residuals)

In [283]:
# Round-6 refit on the points surviving masks rmo..rmof; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof])
reg.score(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof])

1.0

In [284]:
reg.coef_

array([1000000.])

In [285]:
reg.intercept_

0.0

In [287]:
# Squared residuals of the current `reg` on the points surviving masks rmo..rmof.
# Rebinds `residuals`.
y_values_predicted_rmof = reg.predict(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof] - y_values_predicted_rmof) ** 2

In [288]:
# Round-7 keep-mask over the current `residuals`; meant to index the rmo..rmof-filtered arrays.
rmog = outlier_filter(residuals)

In [289]:
# Round-7 refit on the points surviving masks rmo..rmog; displays its R^2.
# NOTE(review): default LinearRegression(), i.e. an intercept is fitted.
reg = LinearRegression()
reg.fit(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog])
reg.score(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog])

1.0

In [290]:
reg.coef_

array([1000000.])

In [291]:
reg.intercept_

0.0

In [206]:
# Squared residuals of the current `reg` on the points surviving masks rmo..rmog.
# Rebinds `residuals`.
# NOTE(review): this cell's execution count (206) is lower than that of the cells
# preceding it, so its stored output may stem from an earlier kernel state.
y_values_predicted_rmog = reg.predict(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog].reshape(-1, 1))
residuals = (y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog] - y_values_predicted_rmog) ** 2

In [207]:
outlier_filter(residuals)

array([ True,  True,  True, False, False,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True])

In [208]:
# Round-8 keep-mask over the current `residuals`; meant to index the rmo..rmog-filtered arrays.
rmoh = outlier_filter(residuals)

In [209]:
# Round-8 refit on the points surviving masks rmo..rmoh; displays its R^2.
# This cell passes fit_intercept=False, i.e. the line is forced through the origin.
reg = LinearRegression(fit_intercept=False)
reg.fit(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog][rmoh].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog][rmoh])
reg.score(x_values[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog][rmoh].reshape(-1, 1), y_values_true[rmo][rmob][rmoc][rmod][rmoe][rmof][rmog][rmoh])

1.0

In [210]:
reg.coef_

array([1000000.])

In [44]:
# "abweicher" (German, roughly "deviators"): squared residuals divided by st_deviation.
# y_values_true, y_values_predicted and st_deviation are module-level names assigned
# inside the pipeline cell's loop, so this uses whatever that loop assigned last.
abweicher = (y_values_true - y_values_predicted) ** 2 / st_deviation

In [92]:
# Squared residuals between observed and predicted values (unscaled).
residuals = (y_values_true - y_values_predicted) ** 2

In [121]:
# Keep-mask: True where the squared residual does not exceed the mean squared residual.
rm = residuals <= residuals.mean()

In [122]:
# No-intercept refit using only the points selected by `rm`.
reg = LinearRegression(fit_intercept=False)
reg.fit(x_values[rm].reshape(-1, 1), y_values_true[rm])

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [123]:
reg.coef_

array([999740.34823907])

In [91]:
# Print each observed value next to the value predicted for the same position.
# (Assumes both arrays have the same length.)
for observed, predicted in zip(y_values_true, y_values_predicted):
    print(str(observed) + " - " + str(predicted))

9910000 - 9958739.443258105
1 - 208038066.96966183
1 - 195290880.48229143
1 - 707867199.6267861
14490000 - 14340584.798291672
6230000 - 6274005.849252606
9140000 - 9102287.851137908
6950000 - 6921323.913064383
1 - 189216049.421904
1 - 39834957.77303242
1 - 1559140247.2364888
1 - 340987238.5371575
1 - 119992851.55181691
1 - 155246789.18095058
1 - 8644185.836748036
1 - 2967704.3540909155
43020000 - 42922167.00044244
1 - 1852325.5364460077
1 - 4993272121.891841
62320000 - 6206286421.038451
1 - 52781319.049267955
4000000000 - 3983495777.303242
1 - 2416986062.878742
1988000000 - 1966851040.0434759
1 - 38739496.434274025
1 - 531288790.55837667
1 - 98223047.12885469
1 - 1732820.6631269103
1 - 4491391.488909406
1 - 7682141730.310973
4538000000 - 4320101170.485366
1 - 178261436.0343201
8310000 - 239009746.63819453
1 - 3634939896.7892084
7050000 - 702091130.7496964
7802000000 - 7769808513.629973
1 - 49793697.216290526
1 - 3535352.502356627
1 - 433185248.30284107
1 - 29273415831.273903
1 - 941220

In [93]:
residuals

array([2.37553333e+09, 4.32798369e+16, 3.81385276e+16, 5.01075971e+17,
       2.23249025e+10, 1.93651477e+09, 1.42220617e+09, 8.22317962e+08,
       3.58027130e+16, 1.58682378e+15, 2.43091831e+18, 1.16272296e+17,
       1.43982842e+16, 2.41015652e+16, 7.47219315e+13, 8.80726320e+12,
       9.57129580e+09, 3.43110619e+12, 2.49327665e+19, 3.77483234e+19,
       2.78586754e+15, 2.72389367e+14, 5.84182162e+18, 4.47278507e+14,
       1.50074851e+15, 2.82267778e+17, 9.64776679e+15, 3.00266398e+12,
       2.01725885e+13, 5.90153015e+19, 4.74798999e+16, 3.17771392e+16,
       5.32223731e+16, 1.32127880e+19, 4.83082173e+17, 1.03629179e+15,
       2.47941218e+15, 1.24987102e+13, 1.87649458e+17, 8.56932874e+20,
       8.85895808e+19, 1.81664811e+11, 1.18632541e+13, 2.86613316e+18,
       1.20759119e+20, 3.73260604e+14, 1.53644627e+19, 5.04181891e+14,
       3.93876727e+16, 3.66129146e+15, 1.03841807e+13, 9.43069413e+17,
       7.21617075e+17, 2.79533710e+17, 8.71854328e+15, 3.51345653e+19,
      

In [94]:
# Mask of points whose scaled squared residual is at most 5000;
# show the mask itself and the number of points it keeps.
bool_arr = abweicher <= 5000
print(bool_arr)
print(np.count_nonzero(bool_arr))

[ True False False False  True  True  True  True False False False False
 False False False  True  True  True False False False False False False
 False False False  True  True False False False False False False False
 False  True False False False  True  True False False False False False
 False False  True False False False False False False  True False False
 False False False False False  True False False False False False False
 False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False False False  True
 False False False False False False False False False False  True False
 False False False False False]
20


In [115]:
# Mask of points whose scaled squared residual, taken relative to the input
# value, is at most 200; show the mask itself and the number of points it keeps.
relative_deviation = abweicher / x_values
awt = relative_deviation <= 200
print(awt)
print(np.count_nonzero(awt))

[ True False False False  True  True  True  True False False False False
 False False False False  True False False False False  True False  True
 False False False False False False False False False False False  True
 False False False False False  True  True False False False False  True
  True  True  True False False False False False False  True False  True
  True  True  True False False  True False False False False  True False
 False  True  True  True  True  True  True  True False False  True  True
 False  True False  True False  True  True  True False  True  True  True
  True  True False False False False False False False  True  True False
  True  True  True  True  True False  True False False False  True False
 False  True  True  True False  True  True  True False False  True False
 False False False  True False]
58


In [103]:
# Print observed vs. predicted value, but only for the points selected by `awt`.
# (Assumes mask and both arrays have the same length.)
for selected, observed, predicted in zip(awt, y_values_true, y_values_predicted):
    if selected:
        print(str(observed) + " - " + str(predicted))

9910000 - 9958739.443258105
14490000 - 14340584.798291672
6230000 - 6274005.849252606
9140000 - 9102287.851137908
6950000 - 6921323.913064383
43020000 - 42922167.00044244
4000000000 - 3983495777.303242
1988000000 - 1966851040.0434759
4538000000 - 4320101170.485366
8310000 - 239009746.63819453
7802000000 - 7769808513.629973
103300000 - 102873778.44885622
834770000 - 831325692.5048568
950000000000 - 946080247109.52
5442000000 - 5419546005.021061
48100000000 - 47901536722.07149
14665000000 - 14604491393.538012
781000000 - 777777550.518458
6000000000 - 6971117610.280674
37040000 - 36887170.89782802
66347000000 - 66073248584.184555
53917000000 - 53694535456.21473
50046000000 - 49839507417.729515
34000000000 - 33859714107.077557
109865000000 - 109411690893.35518
11520000 - 11472467.838633336
17350000 - 9958739.443258105
81456000000 - 80098141342.12494
93547000000 - 93161019869.8466
41148000000 - 40978221061.118454
27516000000 - 27402467452.069004
10373000000 - 10330200424.491632
1732000000 -

In [102]:
print(len(awt[awt == True]))

81


In [104]:
# No-intercept refit using only the points selected by `awt`.
reg = LinearRegression(fit_intercept=False)
reg.fit(x_values[awt].reshape(-1, 1), y_values_true[awt])

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [105]:
reg.coef_

array([1001083.78758989])

In [106]:
reg.score(x_values[awt].reshape(-1, 1), y_values_true[awt])

0.9994306045832846

In [60]:
# Apply the `bool_arr` mask to inputs, observed targets and predictions alike.
x_values_filtered, y_values_true_filtered, y_values_predicted_filtered = (
    unfiltered[bool_arr]
    for unfiltered in (x_values, y_values_true, y_values_predicted)
)

In [61]:
# No-intercept refit on the `bool_arr`-filtered points.
reg = LinearRegression(fit_intercept=False)
reg.fit(x_values_filtered.reshape(-1, 1), y_values_true_filtered)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [62]:
reg.coef_

array([999408.17336139])

In [63]:
# R^2 of the current `reg` on the filtered points.
# Note: rebinds the module-level name `score`, which the pipeline cell also assigns.
score = reg.score(x_values_filtered.reshape(-1, 1), y_values_true_filtered)

In [64]:
score

0.9999881136007207

In [65]:
reg.predict(x_values_filtered.reshape(-1, 1))

array([ 9994081.7336139 , 14391477.69640401,  6296271.49217676,
        9134590.7045231 ,  6945886.80486166, 43074492.2718759 ,
       37018078.74130588, 11513182.15712321, 43864024.7288314 ,
       11593134.81099212])

In [66]:
x_values_filtered

array([10.  , 14.4 ,  6.3 ,  9.14,  6.95, 43.1 , 37.04, 11.52, 43.89,
       11.6 ])