<a href="https://colab.research.google.com/github/AnzhelaSukhanova/ml_tasks/blob/main/Comment%20prediction%20(task%201)/comm_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import math
from random import randrange
from copy import copy
from collections import defaultdict
from statistics import mean, variance 

import pandas as pd

In [2]:
feature_num = 0
fold_num = 5
epoch_num = 80

In [3]:
!rm -rf Dataset.zip Dataset/ sample_data/

## Download data

In [4]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
!unzip Dataset.zip > /devnull
!rm -rf __MACOSX/ > /devnull

--2021-10-11 16:15:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19055526 (18M) [application/x-httpd-php]
Saving to: ‘Dataset.zip’


2021-10-11 16:15:35 (21.3 MB/s) - ‘Dataset.zip’ saved [19055526/19055526]



## Data normalization

In [5]:
def normalize(train_data):
    min_row = train_data.min()
    max_row = train_data.max()

    norm_data = pd.DataFrame(columns=range(feature_num))
    for index, row in train_data.iterrows():
        norm_data = norm_data.append((row - min_row)/(max_row - min_row),
                                     ignore_index=True)
    norm_data.iloc[:, -1] = train_data.iloc[:, -1]
    return norm_data

## Cross validation

In [6]:
def get_folds(train_data):
    folds = []
    ground_truth = []
    data_copy = copy(train_data)
    fold_size = int(len(train_data) / fold_num)
    
    for _ in range(fold_num):
        fold = pd.DataFrame(index=range(fold_size), columns=range(feature_num + 1))
        for i in range(fold_size):
            index = randrange(len(data_copy))
            row = data_copy.iloc[index]
            data_copy.drop(index)
            fold.iloc[i] = row

        ground_truth.append(fold.iloc[:, -1])
        folds.append(fold.iloc[:, :-1])
    return folds, ground_truth

## Prediction

In [7]:
def predict(row, weights):
    pred_res = weights[0]
    for i in range(1, len(row) + 1):
        if math.isnan(row[i - 1]):
            row.iat[i - 1] = 0
        pred_res += weights[i] * row[i - 1]
    return pred_res

## SGD

In [8]:
def sgd(folds, ground_truth, test_ind):
    weights = [0] * (feature_num + 1)
    rows_num = folds[0].shape[0]

    for i in range(fold_num):
        if i != test_ind:
            train_data = folds[i]
            gt = ground_truth[i]
            for j in range(1, epoch_num):
                error_sum = 0
                step = 1 / j
                grad = defaultdict(int)

                for k, row in train_data.iterrows():
                    pred = predict(row, weights)
                    error = pred - gt[k]
                    error_sum += error ** 2
                    grad[0] += error
                    for l in range(1, feature_num):
                        grad[l] += error * row[l - 1]

                for k in range(feature_num):
                    weights[k] = weights[k] - step * (2 / rows_num) * grad[k]
                
                print('epoch=%d, step=%.3f, mse=%.3f' % (j, step, error_sum/rows_num))

    return weights

## Stats (R^2, RMSE, mean, variance)

In [9]:
def get_stats(fold, ground_truth, weights):
    error_sum = 0
    dev_sum = 0
    gt_mean = ground_truth.mean()
    predictions = []
    
    for k, row in fold.iterrows():
        pred = predict(row, weights)
        error = pred - ground_truth[k]
        error_sum += error ** 2
        dev_sum += (ground_truth[k] - gt_mean) ** 2
        predictions.append(pred)

    n = fold.shape[0]
    R2 = 1 - (error_sum / dev_sum)
    rmse = math.sqrt(error_sum / n)
    pred_mean = mean(predictions)
    D = variance(predictions)
    return R2, rmse, pred_mean, D

## Style

In [10]:
def highlight_cells(x):
    df = x.copy()
    df.loc[:, :] = ''
    for i in range(x.shape[0]):
        df.iloc[i, i] = 'background-color: lightgreen'
    return df 

## Execution

In [11]:
def main():
    global feature_num
    train_data = pd.read_csv('Dataset/Training/Features_Variant_1.csv', header=None)
    feature_num = train_data.shape[1] - 1
    train_data = normalize(train_data)
    print('Normalization: done')
    folds, ground_truth = get_folds(train_data)
    print('Cross validation: done')

    stats = pd.DataFrame(index=range(5), columns=['F1', 'F2', 'F3', 'F4', 'F5', 'weights'])
    for i in range(fold_num):
        weights = sgd(folds, ground_truth, i)
        w_str = str(weights[0])
        for j in range(1, len(weights)):
            w_str += '\n' + str(weights[j])
        stats.iloc[i, 5] = w_str
        for j in range(fold_num):
            fold_stats = 'Train:\n' if j != i else 'Test:\n'
            R2, rmse, mean, D = get_stats(folds[j], ground_truth[j], weights)
            fold_stats += 'R^2: ' + str(R2) + '\n'
            fold_stats += 'RMSE: ' + str(rmse) + '\n'
            fold_stats += 'Mean: ' + str(mean) + '\n'
            fold_stats += 'Variance: ' + str(D)
            stats.iloc[i, j] = fold_stats

    table = stats.style.apply(highlight_cells, axis=None)
    return table

In [12]:
table = main()
table

Normalization: done
Cross validation: done
epoch=1, step=1.000, mse=1361.829
epoch=2, step=0.500, mse=2689.728
epoch=3, step=0.333, mse=7491.999
epoch=4, step=0.250, mse=8032.693
epoch=5, step=0.200, mse=3112.012
epoch=6, step=0.167, mse=1264.958
epoch=7, step=0.143, mse=1162.336
epoch=8, step=0.125, mse=1159.248
epoch=9, step=0.111, mse=1156.723
epoch=10, step=0.100, mse=1154.578
epoch=11, step=0.091, mse=1152.719
epoch=12, step=0.083, mse=1151.085
epoch=13, step=0.077, mse=1149.631
epoch=14, step=0.071, mse=1148.323
epoch=15, step=0.067, mse=1147.137
epoch=16, step=0.062, mse=1146.053
epoch=17, step=0.059, mse=1145.056
epoch=18, step=0.056, mse=1144.135
epoch=19, step=0.053, mse=1143.280
epoch=20, step=0.050, mse=1142.481
epoch=21, step=0.048, mse=1141.733
epoch=22, step=0.045, mse=1141.031
epoch=23, step=0.043, mse=1140.368
epoch=24, step=0.042, mse=1139.742
epoch=25, step=0.040, mse=1139.149
epoch=26, step=0.038, mse=1138.585
epoch=27, step=0.037, mse=1138.048
epoch=28, step=0.036,

Unnamed: 0,F1,F2,F3,F4,F5,weights
0,Test: R^2: 0.17831700008821905 RMSE: 28.189511237873976 Mean: 7.479844217925813 Variance: 158.1739220656553,Train: R^2: 0.19198717790949293 RMSE: 32.53294640393711 Mean: 7.373813687734489 Variance: 161.52577286035304,Train: R^2: 0.18627013457647412 RMSE: 28.217937852166447 Mean: 7.549797813558092 Variance: 163.9584224617811,Train: R^2: 0.16958677778236042 RMSE: 32.031521883921606 Mean: 7.585399869923961 Variance: 159.48216369738594,Train: R^2: 0.16424294225050828 RMSE: 34.11187934470486 Mean: 7.562813357875624 Variance: 162.23561235073356,1.7525708668189044 0.30759867066235247 -3.1591080285733306 1.1542456830254662 0.12953194596433207 1.2713639391433653 4.163515386760366 5.99668320483745 5.625734182342655 9.673323239885477 0.6799587847873819 1.1084496542984297 3.4653185521381986 2.5815164476399506 9.710309127017705 0.04011413765890877 2.921733697105625 10.853077030434184 6.085474775399844 9.203049765102236 1.466620175768555 2.475618731903639 6.646753223259847 6.295056173193515 8.438220018449398 0.6194151313834362 1.1023998613773085 1.0698398365972372 0.29861066970839667 9.765471382628151 14.731593505422095 19.495603402149936 -0.7306811034485833 15.592048561997045 12.800613773056194 -25.305592740074758 -0.10799406247563051 1.0710059458645997 0.0 3.9147987781539966 0.06355884119867795 0.10920469500224805 0.7895744528807827 1.3520518721885166 -0.19267065324138447 0.26304150114968905 -0.6321898423597522 0.37240653271572216 1.5831101980714173 0.35919256586359743 1.4088512152171726 -0.08937376232921614 0.4734425698027034 0
1,Train: R^2: 0.17476670139571915 RMSE: 28.25034571480585 Mean: 7.472378263714278 Variance: 155.55749623311686,Test: R^2: 0.18523436705732255 RMSE: 32.6686074564928 Mean: 7.368054251303628 Variance: 158.4894680340363,Train: R^2: 0.1824923286362039 RMSE: 28.283364019490215 Mean: 7.54908545482444 Variance: 161.10371756402037,Train: R^2: 0.16518662153394104 RMSE: 32.11627335697451 Mean: 7.577818798688545 Variance: 156.58696767388383,Train: R^2: 0.1600165196576735 RMSE: 34.1980224452628 Mean: 7.5621673371484555 Variance: 159.38063661963855,2.2366489398240708 0.361148527691118 -2.3551386244040606 1.1640796902167372 0.23480523367657763 0.7743172937750947 4.335177081547205 5.122046751580407 4.586650621560634 9.27142323913654 0.15155735754974822 1.5879314120050154 2.7379657396860324 1.8071046543569957 9.820136224517084 0.08052768769276175 3.7823472336056305 9.308862053038702 4.56773106483567 8.393711334727174 0.8797075802427027 2.995664075114804 5.683445242932788 5.201528299851535 8.27845105582854 0.2520293080147454 1.3718420836079281 0.7788051467853794 0.1503844099394585 9.052410473087964 14.144735268211752 17.87297899813307 0.19691204746749522 14.841798544940898 11.581173280909399 -25.327110941911265 -0.06869962353942154 0.9641157071179522 0.0 4.132069823264431 0.2816281027661295 0.2947789363518009 0.8198878460109573 1.2955656602002605 -0.30342596516576903 0.28758538218428725 -0.43937102252372123 0.14639918208551314 1.242924258757621 0.02717363193309845 1.1899758321547265 -0.15130899828740096 0.4750391053138485 0
2,Train: R^2: 0.17792075357368586 RMSE: 28.196307441064242 Mean: 7.486756500829977 Variance: 159.10035963116877,Train: R^2: 0.19178996815036276 RMSE: 32.53691628097579 Mean: 7.379299539578212 Variance: 162.3706599849393,Test: R^2: 0.18541421201789088 RMSE: 28.232774484539792 Mean: 7.553520953778883 Variance: 164.71006581901852,Train: R^2: 0.16937818538010974 RMSE: 32.03554464776141 Mean: 7.589413713931292 Variance: 160.25658272945162,Train: R^2: 0.16392574312548558 RMSE: 34.118352058102 Mean: 7.563951748499195 Variance: 163.0998453828735,1.874134802436768 0.3496589895590767 -2.6752687942018505 1.1143705768729564 0.09623297675857262 1.350419974343573 3.9325766755031952 6.174684641209339 5.8429241238340035 9.869761287531203 0.653349023476647 0.8036828630004947 3.5787438836580123 2.694728450695732 10.09746647125183 0.10034384872387393 3.341494261753139 10.96864515516269 6.040422202780367 9.437704123316273 1.5479901996161067 2.125304547109318 6.838938978236775 6.5505808412323185 8.62762886880943 0.6232937687823125 0.7235024225997413 1.1603595354612752 0.35082992226610604 10.054800705348123 14.558755915473677 18.99053275536096 -0.41341829187715634 15.297890673767172 12.39620380173778 -25.40440038073563 -0.03941781735005037 1.0844860201522764 0.0 4.026255237266335 0.11476776203953053 0.13331173457219245 0.875364979814868 1.3911193191435576 -0.21033238976948118 0.20593160318678902 -0.6360282065508462 0.4293151092700783 1.563980543439647 0.35599934343292217 1.3238836747965594 -0.12885174670950647 0.4690789857074104 0
3,Train: R^2: 0.17764065664940965 RMSE: 28.201110522043177 Mean: 7.475107099459454 Variance: 155.67290251768293,Train: R^2: 0.1897273634931934 RMSE: 32.57840799054239 Mean: 7.3699393908429744 Variance: 158.77058813953403,Train: R^2: 0.18562325911781852 RMSE: 28.22915156451053 Mean: 7.550738881085954 Variance: 161.50398234524633,Test: R^2: 0.16741432876645745 RMSE: 32.07339344504664 Mean: 7.580090877574172 Variance: 157.10403981435132,Train: R^2: 0.16289321726717332 RMSE: 34.13941311264745 Mean: 7.563420179510699 Variance: 159.5816229400331,1.7802697486940948 0.3429385927818102 -2.9057346187774535 0.9051424311786059 0.35013284468353256 1.2449198229408656 3.730737395420956 5.725395773440944 5.3708063208963654 9.114017906199317 0.6075309183531732 1.845221554665031 3.3915274468949024 2.3985111160548733 10.348253344102906 0.08363848507025624 3.331315259059295 9.800280102384184 5.3378150999755745 8.591854973661336 1.4195711869166565 2.147449009217595 6.3642486221466985 6.009111205316867 8.086679177125404 0.32090522066386135 1.724967323818631 1.2220418106070998 0.3136469392217191 9.810880288219712 14.647985145107556 19.03251271377638 -0.5117877366391571 15.426000322431552 12.430148541643835 -25.163311106974476 0.016615662755414748 0.9896172981003415 0.0 4.059057675637002 0.1584366357337354 -0.0913123700890215 0.5932909993292251 0.9324439291330602 -0.09487853391357463 0.7568992499982189 -0.47461016149763663 -0.18769740958305386 1.3153438958242654 0.26475317206808446 1.552369438968161 0.2121664285304341 0.5003397930033081 0
4,Train: R^2: 0.17725399991958735 RMSE: 28.207739538806717 Mean: 6.998293249710362 Variance: 148.3006440022681,Train: R^2: 0.18822148995021437 RMSE: 32.60866705882786 Mean: 6.901883850116444 Variance: 152.101538171298,Train: R^2: 0.18530768684139742 RMSE: 28.234620455222853 Mean: 7.074531626918811 Variance: 153.8882998659975,Train: R^2: 0.1682907368357871 RMSE: 32.0565082265541 Mean: 7.096498585367688 Variance: 149.25811630270627,Test: R^2: 0.16033355490843315 RMSE: 34.19156815057058 Mean: 7.075361061020563 Variance: 152.46446084483372,1.8800063961630906 0.3586120140209574 -2.3588739199757303 1.0723977004055723 0.02561697140349378 1.151770696901352 3.8931794517807314 5.583294825555955 5.161403215845158 9.285417971867986 0.623855446585331 0.8775812997417377 3.2713791935540937 2.340858599411838 9.50975226380807 0.10116020716324231 3.2213664624260647 9.980678636709833 5.346834434676445 8.723821873037979 1.33643801089386 2.411815675700113 6.201069656165732 5.7681734944914735 8.216034023511984 0.5337832334702959 0.8004830509915818 1.0865798387493135 0.3708961250834385 9.346023125037778 14.143275556976503 18.486825044788844 -0.34826587883196375 14.959629224081093 12.064198763657453 -24.832729679271953 -0.07029451189083771 1.0286727311780826 0.0 3.956455316275512 -0.20663893244116077 -0.014177726493828117 -0.6662571160418536 2.6346675935188215 0.6394870967941697 -0.3042820746547074 -0.20279244451847486 0.4632058858998377 1.3780393375001587 0.13558301513547905 1.7414077106485288 -0.35181688358420854 -0.8352147963161259 0
