<a href="https://colab.research.google.com/github/AnzhelaSukhanova/ml_tasks/blob/main/Comment%20prediction%20(task%201)/comm_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import math
from random import randrange
from copy import copy
from collections import defaultdict
from statistics import mean, variance 

import pandas as pd

In [2]:
feature_num = 0
fold_num = 5
epoch_num = 100

In [3]:
!rm -rf Dataset.zip Dataset/ sample_data/

## Download data

In [4]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
!unzip Dataset.zip > /devnull
!rm -rf __MACOSX/ > /devnull

--2021-10-12 11:52:50--  https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19055526 (18M) [application/x-httpd-php]
Saving to: ‘Dataset.zip’


2021-10-12 11:52:51 (78.6 MB/s) - ‘Dataset.zip’ saved [19055526/19055526]



## Data normalization

In [5]:
def normalize(train_data):
    min_row = train_data.min()
    max_row = train_data.max()

    norm_data = pd.DataFrame(columns=range(feature_num))
    for index, row in train_data.iterrows():
        norm_data = norm_data.append((row - min_row)/(max_row - min_row),
                                     ignore_index=True)
    norm_data.iloc[:, -1] = train_data.iloc[:, -1]
    return norm_data

## Cross validation

In [6]:
def get_folds(train_data):
    folds = []
    ground_truth = []
    data_copy = copy(train_data)
    fold_size = int(len(train_data) / fold_num)
    
    for _ in range(fold_num):
        fold = pd.DataFrame(index=range(fold_size), columns=range(feature_num + 1))
        for i in range(fold_size):
            index = randrange(len(data_copy))
            row = data_copy.iloc[index]
            data_copy.drop(index)
            fold.iloc[i] = row

        ground_truth.append(fold.iloc[:, -1])
        folds.append(fold.iloc[:, :-1])
    return folds, ground_truth

## Prediction

In [7]:
def predict(row, weights):
    pred_res = weights[0]
    for i in range(1, len(row) + 1):
        if math.isnan(row[i - 1]):
            row.iat[i - 1] = 0
        pred_res += weights[i] * row[i - 1]
    return pred_res

## SGD

In [8]:
def sgd(folds, ground_truth, test_ind):
    weights = [0] * (feature_num + 1)
    rows_num = folds[0].shape[0]

    for i in range(fold_num):
        if i != test_ind:
            train_data = folds[i]
            gt = ground_truth[i]
            for j in range(1, epoch_num):
                error_sum = 0
                step = 1 / j
                grad = defaultdict(int)

                for k, row in train_data.iterrows():
                    pred = predict(row, weights)
                    error = pred - gt[k]
                    error_sum += error ** 2
                    grad[0] += error
                    for l in range(1, feature_num):
                        grad[l] += error * row[l - 1]

                for k in range(feature_num):
                    weights[k] = weights[k] - step * (2 / rows_num) * grad[k]
                
                # print('epoch=%d, step=%.3f, mse=%.3f' % (j, step, error_sum/rows_num))

    return weights

## Stats (R^2, RMSE, mean, variance)

In [9]:
def get_stats(fold, ground_truth, weights):
    error_sum = 0
    dev_sum = 0
    gt_mean = ground_truth.mean()
    errors = []
    
    for k, row in fold.iterrows():
        pred = predict(row, weights)
        error = pred - ground_truth[k]
        error_sum += error ** 2
        dev_sum += (ground_truth[k] - gt_mean) ** 2
        errors.append(error)

    n = fold.shape[0]
    R2 = 1 - (error_sum / dev_sum)
    rmse = math.sqrt(error_sum / n)
    err_mean = mean(errors)
    D = variance(errors)
    return R2, rmse, err_mean, D

## Style

In [10]:
def highlight_cells(x):
    df = x.copy()
    df.loc[:, :] = ''
    for i in range(1, x.shape[0], 2):
        df.iloc[i, i//2] = 'background-color: lightgreen'
    return df 

## Execution

In [11]:
def main():
    global feature_num
    train_data = pd.read_csv('Dataset/Training/Features_Variant_1.csv', 
                             header=None)
    feature_num = train_data.shape[1] - 1
    train_data = normalize(train_data)
    print('Normalization: done')
    folds, ground_truth = get_folds(train_data)
    print('Cross validation: done')

    stats = pd.DataFrame(index=[1, ' ', 2, '  ', 3, '   ', 4, '    ', 5, ''], 
                         columns=['F1', 'F2', 'F3', 'F4', 'F5', 'Weights'])
    for i in range(1, fold_num*2, 2):
        weights = sgd(folds, ground_truth, i // 2)
        w_str = str(weights[0])
        for j in range(1, len(weights)):
            w_str += '\n' + str(weights[j])
        stats.iloc[i, 5] = w_str
        stats.iloc[i - 1, 5] = ''
        for j in range(fold_num):
            stats.iloc[i - 1, j] = 'Train' if j != i//2 else 'Test'
            R2, rmse, mean, D = get_stats(folds[j], ground_truth[j], weights)
            fold_stats = 'R^2: ' + str(R2)
            fold_stats += '\nRMSE: ' + str(rmse)
            fold_stats += '\nMean of errors: ' + str(mean)
            fold_stats += '\nVariance of errors: ' + str(D)
            stats.iloc[i, j] = fold_stats

    table = stats.style.apply(highlight_cells, axis=None)
    return table

In [12]:
table = main()
table

Normalization: done
Cross validation: done


Unnamed: 0,F1,F2,F3,F4,F5,Weights
1.0,Test,Train,Train,Train,Train,
,R^2: 0.18804425206476028 RMSE: 32.55781722654627 Mean of errors: 1.1857970924031316 Variance of errors: 1058.7346352270815,R^2: 0.17849769695584206 RMSE: 30.793917276268022 Mean of errors: 1.2187864849192878 Variance of errors: 946.8955309002166,R^2: 0.20508067396300056 RMSE: 29.845144186414636 Mean of errors: 1.1074130213932292 Variance of errors: 889.6149032606288,R^2: 0.1751336791041863 RMSE: 29.694569655730557 Mean of errors: 1.1397963655103258 Variance of errors: 880.5758628341886,R^2: 0.16951255464381265 RMSE: 39.906754534587954 Mean of errors: 0.06935129259288098 Variance of errors: 1592.7387452254884,1.6894550257432868 0.31098717747272914 -3.337579746732746 1.1297215621810321 -1.3459584458883955 1.4016596622917974 3.4494211595656306 6.612793926594696 5.9679787175040095 11.774280502600156 0.348579995578554 2.5050754800920854 4.012628886154855 2.9617147089354154 13.926538148016476 0.07527198329978675 2.279880837092327 10.600312683816842 5.51503737773802 9.38135696019797 1.5595179367949794 1.5934851172780418 7.35605612849957 6.686485387525941 10.55865510453869 0.7662574742674856 2.3067368620224675 1.6087771936472133 0.6220662890213612 11.981249679445929 16.390666197401444 21.518698405319896 -0.9192547865772341 17.36325937465086 14.08434643897895 -27.240442541703153 -0.07872276519781748 0.7110277373972381 0.0 4.163752176208589 -1.1188215770214345 0.3486573339820012 -1.6288065620415346 3.8932124558799868 0.48476276474109464 0.6041112663802901 -0.8936606561771709 -0.5708270966777761 1.4288208258027013 -0.7111430484348993 4.282054175048928 0.4518784702675108 -0.010907407243756264 0
2.0,Train,Test,Train,Train,Train,
,R^2: 0.1903704819562847 RMSE: 32.51114516853501 Mean of errors: 1.1853871424925275 Variance of errors: 1055.6983341282098,R^2: 0.17904497118526397 RMSE: 30.783658312302382 Mean of errors: 1.2102243887744488 Variance of errors: 946.2845315835965,R^2: 0.20672593622518554 RMSE: 29.81424260835675 Mean of errors: 1.1022702910833513 Variance of errors: 887.7824741009041,R^2: 0.1757607080262742 RMSE: 29.68328122432619 Mean of errors: 1.133825439854037 Variance of errors: 879.9190754607466,R^2: 0.1703902097777269 RMSE: 39.88566232754762 Mean of errors: 0.06755805849961763 Variance of errors: 1591.0557870448145,1.480556256200764 0.39636891449897205 -3.700678280403426 1.2261821756977764 -1.1561599808094905 1.3293489077535297 3.2232615442039343 7.01676310438051 6.580051779828184 11.828720910918689 0.37125167717077656 2.4163103739574425 3.977209007742076 2.8566764812666223 13.554701220030166 -0.009033944075105582 2.4489505333198043 11.991417125513372 7.278542273657364 9.759957932165179 1.5182879536227012 1.2190165903986048 7.744202891425649 7.34147676990162 10.346405576306767 0.7017388972860134 2.2212661812346908 1.2708582498927592 0.16940206525705243 12.05684860333373 16.252331169985172 21.768487521852315 -1.1879409326609678 17.183155443810875 14.27743703942596 -27.164673478469087 -0.13082813113394096 0.6785489654888351 0.0 4.2146248119005625 -1.4967851366004494 0.21032193817118172 -1.50242716015186 4.181691694445241 0.6770981256081812 0.5615020046312766 -1.1508452099029909 -0.2815061436812867 1.9024373494811522 -0.28361968648357994 4.4270401758280515 0.36668619518477985 -0.13829137483085066 0
3.0,Train,Train,Test,Train,Train,
,R^2: 0.18899375033675236 RMSE: 32.53877515798212 Mean of errors: 1.1888719197819968 Variance of errors: 1057.4876074734689,R^2: 0.1787500896198675 RMSE: 30.789186458200877 Mean of errors: 1.2180313625537165 Variance of errors: 946.6059971796731,R^2: 0.20512979310758017 RMSE: 29.844222086176906 Mean of errors: 1.1080820389249026 Variance of errors: 889.5583745737588,R^2: 0.17522911994733514 RMSE: 29.69285170660552 Mean of errors: 1.1370606667777927 Variance of errors: 880.4800553609856,R^2: 0.16960731258171957 RMSE: 39.90447780562502 Mean of errors: 0.07093980294113504 Variance of errors: 1592.556791608109,1.7563760178685834 0.3603114924583797 -3.402498649596315 1.1487494441164179 -1.3031623109207582 1.2965174391981726 3.0813005085398024 6.707867879858905 6.1180831698936835 11.77707045079182 0.2902117493845628 2.2962642398221136 3.8430161834087837 2.782567885575494 13.498538645993222 0.04343789393924803 3.266622179831549 11.129946033755433 5.779028200294849 9.628566504194264 1.4536122243827976 1.1415559738474141 7.439656407605346 6.91714701165984 10.474705805905902 0.1259635116039228 2.264404014213791 1.3538484319060924 0.46651636252393075 11.892598489969062 16.035989327518454 21.788041751919938 -1.6088974447486533 17.032947370960446 14.594725353112334 -27.275347817521208 -0.1248978124167322 0.7270480863400233 0.0 4.23820433319604 -1.2821614681760163 0.3964578473544291 -1.3668128029853526 4.13307606007794 0.5308443058291916 0.5091605002520571 -1.16418842448388 -0.427917994262456 1.6974195053592303 -0.660372478369776 4.076247793405072 0.17894262797779076 -0.192202602812917 0
4.0,Train,Train,Train,Test,Train,
,R^2: 0.19162090260522913 RMSE: 32.48602978254712 Mean of errors: 1.180209016697568 Variance of errors: 1054.0779564732102,R^2: 0.18034725697406584 RMSE: 30.759232472661868 Mean of errors: 1.2070814929391362 Variance of errors: 944.7887094804912,R^2: 0.2083569103098386 RMSE: 29.783577748043992 Mean of errors: 1.0969923807737396 Variance of errors: 885.9663009938223,R^2: 0.176381665562362 RMSE: 29.672097863983065 Mean of errors: 1.1294646107678223 Variance of errors: 879.2650728256365,R^2: 0.17121009994758485 RMSE: 39.865948272504 Mean of errors: 0.0680394691379666 Variance of errors: 1589.4833021014751,1.3757994360698813 0.3855496043338058 -3.180528687697844 1.2544979460937506 -1.235543933338862 1.490821166885485 3.2490738907775176 7.190651931138276 6.765570482069379 12.11561277923858 0.36870181049183964 1.6075707678545528 4.009471959204996 2.936433845319073 13.308236229168749 0.043614635447356186 2.75886868786456 12.482267761400479 7.1477737152422485 10.462398693590133 1.6638698836039791 1.1409686547026958 7.907844613069742 7.5466619744675 10.55193498715967 0.4949601967726187 1.5396319539235115 1.1889050143507258 0.16506177191706764 12.241484884982292 16.553776404179256 22.186343930640295 -1.2124340834966365 17.525330508952237 14.499479757760566 -27.31071382073187 -0.08955962929280328 0.8069165611474534 0.0 4.409049440417521 -1.3141343620292123 0.1395459170202879 -1.6863064729411965 4.140692275335177 0.8413296764451822 0.3986452918307273 -1.1439728895912362 -0.17679729438191835 1.7646890641674031 -0.3183013786580238 4.5856112011563415 0.6456647859308539 -0.09109745235293437 0
5.0,Train,Train,Train,Train,Test,
,R^2: 0.1885878196849311 RMSE: 32.54691740955344 Mean of errors: 0.017611501949507388 Variance of errors: 1059.4308951373318,R^2: 0.18607670306098623 RMSE: 30.651539050978172 Mean of errors: 0.06377398051577493 Variance of errors: 939.6275217182686,R^2: 0.20849983514243464 RMSE: 29.780889033224195 Mean of errors: -0.02876291326358009 Variance of errors: 887.0088414174056,R^2: 0.18212318818101758 RMSE: 29.568493465228674 Mean of errors: 0.04306466515124615 Variance of errors: 874.4007287111681,R^2: 0.15968252228327784 RMSE: 40.142237138581 Mean of errors: -1.1017645742370226 Variance of errors: 1610.3819691593064,1.4340693153165622 0.33294780981614563 -3.0371015848188203 0.9987689096291881 -1.2093488295200647 1.313800101876198 1.565896494291267 6.679624241717998 6.485205748612961 10.300691941663871 0.3621570316956144 0.7805333240813382 3.8615257877236706 2.852157279838227 12.326361859536506 0.04737201224857808 2.4114383149508196 11.276209290766444 6.470061162886112 9.277641289727615 1.4900494897478223 -0.37017983331959914 7.3901440273521075 7.246056734994541 9.056009352341043 0.727089875199166 0.6800616485301352 1.308725999300738 0.35537014816873214 11.105681564266934 14.99952812806864 20.276545445080462 -1.1609205458925862 15.935978511887212 13.346053451727922 -24.20054672016204 -0.0029585572593872006 0.6954320890921798 0.0 4.062038780924477 0.9874947114520395 -0.20439118204439424 -0.9477571412617021 1.0425978666513207 -0.14021311710149853 1.4864136956073475 -0.7900755179867284 0.6553698144651102 1.2380006973652067 -0.007341690438756659 0.6792550338982808 1.1331266123677202 0.751156422108842 0
