<a href="https://colab.research.google.com/github/AnzhelaSukhanova/ml_tasks/blob/main/Comment%20prediction%20(task%201)/comm_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import sys
import math
from random import randrange
from copy import copy
from collections import defaultdict
from statistics import mean, variance 

import pandas as pd

In [14]:
feature_num = 0
fold_num = 5
epoch_num = 80

In [15]:
!rm -rf Dataset.zip Dataset/ sample_data/

## Download data

In [16]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
!unzip Dataset.zip > /devnull
!rm -rf __MACOSX/ > /devnull

--2021-10-11 18:57:48--  https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19055526 (18M) [application/x-httpd-php]
Saving to: ‘Dataset.zip’


2021-10-11 18:57:49 (21.6 MB/s) - ‘Dataset.zip’ saved [19055526/19055526]



## Data normalization

In [17]:
def normalize(train_data):
    min_row = train_data.min()
    max_row = train_data.max()

    norm_data = pd.DataFrame(columns=range(feature_num))
    for index, row in train_data.iterrows():
        norm_data = norm_data.append((row - min_row)/(max_row - min_row),
                                     ignore_index=True)
    norm_data.iloc[:, -1] = train_data.iloc[:, -1]
    return norm_data

## Cross validation

In [18]:
def get_folds(train_data):
    folds = []
    ground_truth = []
    data_copy = copy(train_data)
    fold_size = int(len(train_data) / fold_num)
    
    for _ in range(fold_num):
        fold = pd.DataFrame(index=range(fold_size), columns=range(feature_num + 1))
        for i in range(fold_size):
            index = randrange(len(data_copy))
            row = data_copy.iloc[index]
            data_copy.drop(index)
            fold.iloc[i] = row

        ground_truth.append(fold.iloc[:, -1])
        folds.append(fold.iloc[:, :-1])
    return folds, ground_truth

## Prediction

In [19]:
def predict(row, weights):
    pred_res = weights[0]
    for i in range(1, len(row) + 1):
        if math.isnan(row[i - 1]):
            row.iat[i - 1] = 0
        pred_res += weights[i] * row[i - 1]
    return pred_res

## SGD

In [20]:
def sgd(folds, ground_truth, test_ind):
    weights = [0] * (feature_num + 1)
    rows_num = folds[0].shape[0]

    for i in range(fold_num):
        if i != test_ind:
            train_data = folds[i]
            gt = ground_truth[i]
            for j in range(1, epoch_num):
                error_sum = 0
                step = 1 / j
                grad = defaultdict(int)

                for k, row in train_data.iterrows():
                    pred = predict(row, weights)
                    error = pred - gt[k]
                    error_sum += error ** 2
                    grad[0] += error
                    for l in range(1, feature_num):
                        grad[l] += error * row[l - 1]

                for k in range(feature_num):
                    weights[k] = weights[k] - step * (2 / rows_num) * grad[k]
                
                # print('epoch=%d, step=%.3f, mse=%.3f' % (j, step, error_sum/rows_num))

    return weights

## Stats (R^2, RMSE, mean, variance)

In [21]:
def get_stats(fold, ground_truth, weights):
    error_sum = 0
    dev_sum = 0
    gt_mean = ground_truth.mean()
    predictions = []
    
    for k, row in fold.iterrows():
        pred = predict(row, weights)
        error = pred - ground_truth[k]
        error_sum += error ** 2
        dev_sum += (ground_truth[k] - gt_mean) ** 2
        predictions.append(pred)

    n = fold.shape[0]
    R2 = 1 - (error_sum / dev_sum)
    rmse = math.sqrt(error_sum / n)
    pred_mean = mean(predictions)
    D = variance(predictions)
    return R2, rmse, pred_mean, D

## Style

In [22]:
def highlight_cells(x):
    df = x.copy()
    df.loc[:, :] = ''
    for i in range(x.shape[0]):
        df.iloc[i, i] = 'background-color: lightgreen'
    return df 

## Execution

In [23]:
def main():
    global feature_num
    train_data = pd.read_csv('Dataset/Training/Features_Variant_1.csv', header=None)
    feature_num = train_data.shape[1] - 1
    train_data = normalize(train_data)
    print('Normalization: done')
    folds, ground_truth = get_folds(train_data)
    print('Cross validation: done')

    stats = pd.DataFrame(index=range(1, 6), columns=['F1', 'F2', 'F3', 'F4', 'F5', 'Weights'])
    for i in range(fold_num):
        weights = sgd(folds, ground_truth, i)
        w_str = str(weights[0])
        for j in range(1, len(weights)):
            w_str += '\n' + str(weights[j])
        stats.iloc[i, 5] = w_str
        for j in range(fold_num):
            fold_stats = 'Train:\n' if j != i else 'Test:\n'
            R2, rmse, mean, D = get_stats(folds[j], ground_truth[j], weights)
            fold_stats += 'R^2: ' + str(R2) + '\n'
            fold_stats += 'RMSE: ' + str(rmse) + '\n'
            fold_stats += 'Mean: ' + str(mean) + '\n'
            fold_stats += 'Variance: ' + str(D)
            stats.iloc[i, j] = fold_stats

    table = stats.style.apply(highlight_cells, axis=None)
    return table

In [24]:
table = main()
table

Normalization: done
Cross validation: done


Unnamed: 0,F1,F2,F3,F4,F5,Weights
1,Test: R^2: 0.1880390631388601 RMSE: 27.188052201042176 Mean: 8.059428844424763 Variance: 192.24142410645777,Train: R^2: 0.1826072897097073 RMSE: 30.86090362922966 Mean: 7.890221897398501 Variance: 186.63638467085755,Train: R^2: 0.1705221016764653 RMSE: 29.789212651818534 Mean: 7.928973585994837 Variance: 189.40703340249465,Train: R^2: 0.20737444729670207 RMSE: 25.47326975877135 Mean: 7.84631582922563 Variance: 190.82530047031076,Train: R^2: 0.1987876649551299 RMSE: 36.12219526518343 Mean: 8.102207482070105 Variance: 203.0261270042713,1.60919300164304 0.40628381579065176 -2.0092720217415 1.4361600314301892 -0.03388949098783243 1.4818542864473727 3.426859157628007 7.207102797206148 7.00336589054226 11.151428212427493 0.7235765072015209 1.7522380865415754 4.344968853875987 3.3054966336227345 12.50092246791879 0.014064416950338799 3.020992073614428 12.10872426360694 7.541615406952558 9.715908614814865 1.7160301350623797 1.8421052032352718 7.996747306151223 7.833572552546342 9.89119263299289 0.5411158816146836 1.7397579963300451 1.5909598373807075 0.7046133321995688 11.545898026575044 15.014620964595636 19.734229621030444 -0.9118836948333792 16.02836919442443 12.972189942692049 -26.14390381452325 -0.038711300022745296 0.9493952180520401 0.0 3.983577518190948 1.0713266738349816 -0.9707188203337034 0.49778433317062143 3.2264901089547395 0.8572683480360004 -1.0077465119680273 -2.0652111300514275 1.069043810685272 0.9728330876497676 0.7108084140663798 1.642496415010523 0.38126903922122307 -1.5579177450249235 0
2,Train: R^2: 0.18639273641942777 RMSE: 27.215601402455235 Mean: 8.058248039686243 Variance: 190.568825532378,Test: R^2: 0.17931891518395182 RMSE: 30.922918097309303 Mean: 7.892330510125832 Variance: 185.07809856609646,Train: R^2: 0.16807121949417814 RMSE: 29.83318971203517 Mean: 7.9266472698713 Variance: 187.43147107791577,Train: R^2: 0.20470361430768924 RMSE: 25.516151061763065 Mean: 7.843949714340863 Variance: 188.9100076729127,Train: R^2: 0.19538308461762666 RMSE: 36.198860677049495 Mean: 8.100334211342672 Variance: 200.6330742764495,1.706799775542762 0.35306210589132864 -1.340544734856339 1.2037931688231156 0.03390396512882499 1.3985052878863629 3.7711345302144155 6.744869313062746 6.368868801257779 10.890992874224686 0.6890178493189341 2.0017262415181745 3.8955247124374877 2.9194077485258534 11.399031316054923 0.043415789354003004 3.7036202875205317 11.553426713403084 6.719259428722591 9.673608283984413 1.6104462442152554 2.2663188290929166 7.475154564295053 7.176415175426436 9.636732542547557 0.2515442023412184 1.9471689173024669 1.309408448483549 0.5191660358055282 10.708498832510884 14.745879150976348 19.142360585899926 -0.6863753823524105 15.658522142167348 12.548646920218596 -26.10582165174866 0.03294909437445975 0.876193665831364 0.0 3.9263174465139845 1.012751779885002 -0.951520345877519 0.5457460889056487 3.3396581859565537 0.9036618468638539 -1.0154894025125687 -2.1280083776780736 1.3351404084577538 1.246777343302566 0.9659858647955045 1.829324838767658 0.5528376108667641 -1.388880653899877 0
3,Train: R^2: 0.18744589921627441 RMSE: 27.197981267032173 Mean: 8.060650589683734 Variance: 192.52493711348495,Train: R^2: 0.18120238549546563 RMSE: 30.88741353020958 Mean: 7.892031445892728 Variance: 186.92785699218584,Test: R^2: 0.16894291378945658 RMSE: 29.81755601983852 Mean: 7.927665822826359 Variance: 189.3439838956202,Train: R^2: 0.2059767388657544 RMSE: 25.49571952813516 Mean: 7.843927085967346 Variance: 190.75243445225894,Train: R^2: 0.1974656358769985 RMSE: 36.15198444057216 Mean: 8.10347433540304 Variance: 202.91987042437916,1.5995535815678557 0.35342233317489596 -1.8591474859859032 1.1962848690924353 0.06852997479079903 1.3717839366471112 3.8505811345118532 6.961269196868246 6.616788009602801 11.249874556070157 0.6973152736793595 1.5942322238505595 4.145654402331169 3.1661462973871135 12.079250793910983 0.022863169126111735 3.2547444503142784 11.614231792163594 6.764799326576898 9.63019076930854 1.5943790710361812 2.1460929482416677 7.734313591682256 7.50433466505445 9.96395477455058 0.3260231567532665 1.5314598512976871 1.5118038770940678 0.7404410137229338 11.295769551997173 14.765078623424344 19.607854124371286 -1.1570259877320206 15.745300377532528 13.00631222853397 -26.29669985491007 0.09866995593457285 0.8055473382896686 0.0 3.8685780560837757 0.9312246128854353 -1.124196863134115 0.37179892097870404 3.4280974731801317 1.0946460953515553 -0.9707394852264685 -2.1312771724673096 1.3321984610780548 1.385585646161863 1.0712027837677098 2.0828633126666305 0.6069091118288105 -1.5482292034700464 0
4,Train: R^2: 0.1884844007352311 RMSE: 27.18059523979699 Mean: 8.071461207416498 Variance: 193.01110799255213,Train: R^2: 0.18237110147198332 RMSE: 30.86536198558846 Mean: 7.892334632511418 Variance: 187.62416522239548,Train: R^2: 0.17088035661865653 RMSE: 29.78277891484883 Mean: 7.929359295318223 Variance: 190.0479758418761,Test: R^2: 0.20635262970062462 RMSE: 25.489683973348956 Mean: 7.846628272333335 Variance: 191.49992824671577,Train: R^2: 0.19872740970756253 RMSE: 36.12355352616044 Mean: 8.10219366894337 Variance: 203.65347602263685,1.5486530661004905 0.29875085925630496 -2.246464818832622 1.122555302426639 -0.2494480397970476 1.4767767957363032 3.6116231360900324 7.032703376613123 6.734457397777042 11.140614276795926 0.715308146130449 1.9797268846760574 4.15804856579712 3.1076060354881205 12.283646563515 0.045400598721027544 3.262212852950655 11.660452652030386 7.283250414427606 9.4463285802843 1.6941121620081365 1.8582642518669041 7.779770317089057 7.525279244539418 9.835960930041182 0.4928429065235482 1.9798586807109997 1.5084709862004635 0.5970123635983837 11.180810678105306 15.125960137868848 20.171254538290817 -1.3906072086613663 16.170096953158414 13.434955832598408 -26.247831174745507 -0.000370657593833705 0.870700696971166 0.0 3.908148053971558 0.508749511888343 -1.1060429670049905 1.0227929998661727 3.351747939037836 1.0711062731495564 -1.0403097956959146 -2.259390895140381 1.2517363887000825 1.336887786425153 1.2975538073689872 1.4146082415463124 0.21648787317138105 -1.941626373597129 0
5,Train: R^2: 0.1877189515982639 RMSE: 27.193411049756865 Mean: 6.9315865625716135 Variance: 126.52625541479294,Train: R^2: 0.1747365932061159 RMSE: 31.009127907812644 Mean: 6.815822147835828 Variance: 123.74375018569582,Train: R^2: 0.17075982615452845 RMSE: 29.784943621560625 Mean: 6.820691428650441 Variance: 125.27344896398647,Train: R^2: 0.21083829786842834 RMSE: 25.41754848504729 Mean: 6.770361120194058 Variance: 127.16661896490126,Test: R^2: 0.17767099491383476 RMSE: 36.59511590355578 Mean: 6.995067395828926 Variance: 133.3440645369636,1.9753549373236663 0.33697062448615567 -1.0532692043062937 0.8898811062851173 0.30561324895210557 0.8657146260446155 2.3538305004178417 5.1911622391351715 4.89037663356292 8.824970258206143 0.20297083941705285 1.1866142591700677 3.0190300960118486 2.18327010429763 10.064071642724896 0.04305719717587346 2.509419435206187 9.089081529983531 5.034210216038724 7.8934766563295495 0.9694956614203284 0.9392472330694409 5.729431020781953 5.440906525459687 7.835312567149502 0.4310511612578895 1.1967729921089814 1.051445800364344 0.3981459960366245 9.285987735088147 13.078071383209858 17.326865043694607 -0.8608954010786118 13.93409781054817 11.640819851656428 -23.63421078678508 0.030076207850049438 0.7891172857277434 0.0 4.089665199499364 1.0694695647000592 -0.016845632878290936 -0.04636052974456239 0.8930574089767177 -0.23925121311961492 1.2130482083044134 -0.897762868914975 0.5881910487293598 0.9999207851963341 -0.29304016548190537 0.6947451635112482 -0.6265574694003473 0.60512741650459 0
