In [13]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import scipy as sp

In [14]:
df_freelancer = pd.read_csv('input/FreeLancerT.csv')

### 1_Inlocuirea datelor lipsa cu media sau modul
- Datele numerice se inlocuiesc cu media 
    - Datele numerice lipsa se marcheaza cu NaN
- Pentru ca nu exista medie pentru date non numerice ele se inlocuiesc cu modul
    - Datele non numerice lipsa se marcheaza cu NaN
- Pentru a inlocui datele lipsa va trebui sa iteram prin fiecare coloana din setul de date
    - Daca coloana curenta este de tip numeric atunci inlocuiesc cu media
    - Daca coloana curenta este de tip string atunci inlocuiesc cu modul

In [20]:
for column in df_freelancer.columns:
    #prima conditie in for determina tipul de date | a doua conditie in for este pentru eficienta
    if (is_numeric_dtype(df_freelancer[column])) and df_freelancer[column].isna().any():
        df_freelancer[column].fillna(df_freelancer[column].mean(), inplace=True)
    else: 
        df_freelancer[column].fillna(df_freelancer[column].mode(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_freelancer[column].fillna(df_freelancer[column].mode(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_freelancer[column].fillna(df_freelancer[column].mean(), inplace=True)


In [21]:
df_freelancer

Unnamed: 0,Country,CountryCode,Continent,C,C_Test,Html,Html_test,Java,Java_test,PHP,PHP_test
0,Aruba,ABW,EU,1.918226,0.000000,4.795565,0.959113,2.877339,0.000000,3.836452,0.000000
1,Angola,AGO,AF,0.023228,0.003871,0.011614,0.000000,0.023228,0.000000,0.019357,0.000000
2,Albania,ALB,EU,7.783173,0.241072,4.593976,0.551021,6.956642,0.241072,12.397975,0.860970
3,Andorra,AND,EU,7.229090,0.000000,7.229090,1.445818,10.120726,0.000000,8.674908,1.445818
4,United Arab Emirates,ARE,AS,7.176023,0.291357,7.996140,0.291357,4.478270,0.140283,7.629246,0.151074
...,...,...,...,...,...,...,...,...,...,...,...
185,Vanuatu,VUT,OC,0.739454,0.000000,0.369727,0.000000,0.369727,0.000000,0.739454,0.000000
186,Samoa,WSM,OC,4.112624,0.000000,2.570390,0.514078,4.626702,0.000000,3.598546,0.514078
187,Yemen Rep.,YEM,AS,0.080065,0.000000,0.076426,0.007279,0.043672,0.000000,0.058229,0.003639
188,South Africa,ZAF,AF,1.109516,0.027283,1.227744,0.023645,0.836685,0.030921,1.113154,0.040015


### 2_Standardizarea datelor
- Datele sunt standardizate daca au medie 0 si abatere standard 1
- Pentru a standardiza un set de date, pentru fiecare coloana numerica avem:
    - df[col_num] = (df[col] - df[col].mean())/df[col].std()


In [22]:
for column in df_freelancer.columns:
    if (is_numeric_dtype(df_freelancer[column])):
        df_freelancer[column] = (df_freelancer[column] - df_freelancer[column].mean())/df_freelancer[column].std()

In [25]:
for column in df_freelancer.columns:
    if (is_numeric_dtype(df_freelancer[column])):
        print(f"{df_freelancer[column].mean():.2f} ~ {df_freelancer[column].std():.2f}");

-0.00 ~ 1.00
-0.00 ~ 1.00
0.00 ~ 1.00
0.00 ~ 1.00
0.00 ~ 1.00
-0.00 ~ 1.00
-0.00 ~ 1.00
0.00 ~ 1.00


Explicatii --> dupa cum se vede mai sus, toate coloanele numerice au media 0 si abaterea standarad 1 --> datele sunt standardizate

### 3_Calculul matricilor de covarianta/corelatie
- Daca matricile de covarianta/corelatie nu sunt raportate la nimic, atunci ele se calculeaza pur si simplu din valorile numerice ale dataframe-ului folosind corr sau cov

In [28]:
mat_corel = df_freelancer[df_freelancer.select_dtypes('number').columns].corr()
mat_cov = df_freelancer[df_freelancer.select_dtypes('number').columns].cov()

In [29]:
mat_corel

Unnamed: 0,C,C_Test,Html,Html_test,Java,Java_test,PHP,PHP_test
C,1.0,0.233716,0.595868,0.955766,0.926323,0.225083,0.60965,0.958705
C_Test,0.233716,1.0,0.741141,0.16383,0.392074,0.661357,0.707744,0.224505
Html,0.595868,0.741141,1.0,0.495562,0.750453,0.640734,0.962851,0.540528
Html_test,0.955766,0.16383,0.495562,1.0,0.841866,0.161197,0.485323,0.983244
Java,0.926323,0.392074,0.750453,0.841866,1.0,0.425584,0.772737,0.849931
Java_test,0.225083,0.661357,0.640734,0.161197,0.425584,1.0,0.616576,0.158408
PHP,0.60965,0.707744,0.962851,0.485323,0.772737,0.616576,1.0,0.555824
PHP_test,0.958705,0.224505,0.540528,0.983244,0.849931,0.158408,0.555824,1.0


In [30]:
mat_cov

Unnamed: 0,C,C_Test,Html,Html_test,Java,Java_test,PHP,PHP_test
C,1.0,0.233716,0.595868,0.955766,0.926323,0.225083,0.60965,0.958705
C_Test,0.233716,1.0,0.741141,0.16383,0.392074,0.661357,0.707744,0.224505
Html,0.595868,0.741141,1.0,0.495562,0.750453,0.640734,0.962851,0.540528
Html_test,0.955766,0.16383,0.495562,1.0,0.841866,0.161197,0.485323,0.983244
Java,0.926323,0.392074,0.750453,0.841866,1.0,0.425584,0.772737,0.849931
Java_test,0.225083,0.661357,0.640734,0.161197,0.425584,1.0,0.616576,0.158408
PHP,0.60965,0.707744,0.962851,0.485323,0.772737,0.616576,1.0,0.555824
PHP_test,0.958705,0.224505,0.540528,0.983244,0.849931,0.158408,0.555824,1.0


### 4_Matrici de corelatie sau covarianta raportate la o valoare
- Rezultatul este un vector de matrici de lungime QxMxM
    - Q reprezinta valoare la care ne raportam 
    - M nr de variabile studiate
- In cazul de fata raportam variabilele C, C_test, Html... la fiecare tara in parte

In [43]:
country_list = df_freelancer['Country'].unique().tolist()
number_of_variables = df_freelancer.select_dtypes('number').columns.size

corelated_cov_matrix = np.empty(shape=(len(country_list), number_of_variables, number_of_variables))
corelated_corel_matrix = np.empty(shape=(len(country_list), number_of_variables, number_of_variables))

index = 0
for country in country_list:
    current_country_mask = df_freelancer['Country'] == country
    df_country_selection = df_freelancer[current_country_mask]
    df_numerical_values = df_country_selection.select_dtypes('number')

    corelated_corel_matrix[index] = np.cov(df_numerical_values)
    corelated_cov_matrix[index] = np.std(df_numerical_values)
    


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwa

In [44]:
corelated_cov_matrix

array([[[0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        ...,
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000]],

       [[1.03753786e-322, 2.23883674e-316, 6.90287758e-310, ...,
         5.51718916e-313, 1.18575755e-322, 6.90287669e-310],
        [4.94065646e-324, 4.40904182e-320, 5.72938873e-313, ...,
         2.23884148e-316, 6.90287669e-

In [45]:
corelated_corel_matrix

array([[[3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002],
        [3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002],
        [3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002],
        ...,
        [3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002],
        [3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002],
        [3.32121801e-002, 3.32121801e-002, 3.32121801e-002, ...,
         3.32121801e-002, 3.32121801e-002, 3.32121801e-002]],

       [[6.90278424e-310, 2.17573294e-316, 2.33419537e-313, ...,
         6.90278318e-310, 6.90278318e-310, 4.24399158e-314],
        [6.90278424e-310, 2.17573294e-316, 1.27319747e-313, ...,
         2.17573269e-316, 2.17573269e-