# Importing Libraries

In [4]:
import shutil
import os
import pandas as pd
import numpy as np
import tqdm
import gc

# Reformatting Data

We are going to reformate the data we have in order to reduce memory consumed by those 3 chunks of dataframes, we are going to change the data type of each column in our dataframes whenever possible.

In [5]:
def reduce_mem_usage(df, int_cast = False, obj_to_category = False, subset = None):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if columns should be tried to be casted to int (bool)
    :param obj_to_category: convert non-datetime related objects to category dtype (bool)
    :param subset: subset of columns to analyse (list)
    :return: dataset with the column dtypes adjusted (pd.DataFrame)
    """
    
    start_mem = df.memory_usage().sum() / 1024 ** 2;
    gc.collect()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    cols = df.columns
    for col in tqdm.tqdm(cols):
        col_type = df[col].dtype

        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()

            # test if column can be converted to an integer
            treat_as_int = str(col_type)[:3] == 'int'

            if treat_as_int:
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif 'datetime' not in col_type.name and obj_to_category:
            df[col] = df[col].astype('category')
        
        # int to uint
        if subset:
          for col in subset:
            df[col] = df[col].astype(np.uint16)

    gc.collect()
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('\nMemory usage after optimization is: {:.3f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

### Reformate Training Data

In [6]:
x_train_1 = pd.read_csv("Data/x_train_1.csv")

In [7]:
print(x_train_1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 25 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   salton_for_outcoming               float64
 1   salton_for_incoming                float64
 2   source_outdegree                   int64  
 3   source_indegree                    int64  
 4   destination_outdegree              int64  
 5   destination_indegree               int64  
 6   outcoming_intersection             int64  
 7   incoming_intersection              int64  
 8   jaccard_for_outcoming              float64
 9   jaccard_for_incoming               float64
 10  sorensen_for_outcoming             float64
 11  sorensen_for_incoming              float64
 12  outcoming_preferential_attachment  int64  
 13  incoming_preferential_attachment   int64  
 14  adamic_adar                        float64
 15  outcoming_hup_promoted             float64
 16  incoming_hup_pro

In [None]:
x_train_1.drop(["source_outdegree", "destination_indegree"], axis = 1, inplace = True)

In [9]:
x_train_1_new = x_train_1.copy()

In [12]:
cols = ["source_indegree", "destination_outdegree", \
        "outcoming_intersection", "incoming_intersection"]
x_train_1_new = reduce_mem_usage(x_train_1_new, subset = cols)

Memory usage of dataframe is 2883.70 MB


100%|██████████| 23/23 [00:16<00:00,  1.39it/s]


Memory usage after optimization is: 1345.728 MB
Decreased by 53.3%





In [14]:
for i in x_train_1.columns:
  print("diffrence in term of", i, "=", np.sum(np.abs(x_train_1[i] - x_train_1_new[i])))

diffrence in term of salton_for_outcoming = 0.025889267112001332
diffrence in term of salton_for_incoming = 0.027719576455859415
diffrence in term of source_indegree = 0
diffrence in term of destination_outdegree = 0
diffrence in term of outcoming_intersection = 0
diffrence in term of incoming_intersection = 0
diffrence in term of jaccard_for_outcoming = 0.016680912073899365
diffrence in term of jaccard_for_incoming = 0.017774064990973185
diffrence in term of sorensen_for_outcoming = 0.012763284822498014
diffrence in term of sorensen_for_incoming = 0.013660582292324603
diffrence in term of outcoming_preferential_attachment = 0
diffrence in term of incoming_preferential_attachment = 0
diffrence in term of adamic_adar = 0.5354153128034917
diffrence in term of outcoming_hup_promoted = 0.029443154973710573
diffrence in term of incoming_hup_promoted = 0.03136107297636336
diffrence in term of outcoming_hup_depressed = 0.02011266525556596
diffrence in term of incoming_hup_depressed = 0.021191

In [15]:
print(x_train_1_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 23 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   salton_for_outcoming               float32
 1   salton_for_incoming                float32
 2   source_indegree                    uint16 
 3   destination_outdegree              uint16 
 4   outcoming_intersection             uint16 
 5   incoming_intersection              uint16 
 6   jaccard_for_outcoming              float32
 7   jaccard_for_incoming               float32
 8   sorensen_for_outcoming             float32
 9   sorensen_for_incoming              float32
 10  outcoming_preferential_attachment  int32  
 11  incoming_preferential_attachment   int32  
 12  adamic_adar                        float32
 13  outcoming_hup_promoted             float32
 14  incoming_hup_promoted              float32
 15  outcoming_hup_depressed            float32
 16  incoming_hup_dep

In [16]:
print("The Shape of x_train_1 before reformattring =", x_train_1.shape)
print("The Shape of x_train_1 after reformattring = ", x_train_1_new.shape)

The Shape of x_train_1 before reformattring = (16798783, 23)
The Shape of x_train_1 after reformattring =  (16798783, 23)


In [17]:
x_train_1_new.to_csv("Data/x_train_1.csv", index = False)

In [14]:
x_train_2 = pd.read_csv("Data/x_train_2.csv")

In [15]:
print(x_train_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   source_katz              float64
 1   source_rank              float64
 2   destination_rank         float64
 3   source_random_walk       float64
 4   destination_random_walk  float64
 5   racn                     float64
 6   destination_katz         float64
dtypes: float64(7)
memory usage: 897.2 MB
None


In [16]:
x_train_2_new = x_train_2.copy()

In [17]:
x_train_2_new = reduce_mem_usage(x_train_2_new)

Memory usage of dataframe is 897.15 MB


100%|██████████| 7/7 [00:03<00:00,  2.09it/s]


Memory usage after optimization is: 448.576 MB
Decreased by 50.0%





In [18]:
for i in x_train_2.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_train_2[i] - x_train_2_new[i])))

diffrence in term of source_katz = 8.573564936348028e-06
diffrence in term of source_rank = 0.00046970723161023943
diffrence in term of destination_rank = 0.00047594485565226677
diffrence in term of source_random_walk = 0.0028127744998771437
diffrence in term of destination_random_walk = 0.002511604875506432
diffrence in term of racn = 0.0005509681197164446
diffrence in term of destination_katz = 6.894042813918554e-06


In [19]:
print(x_train_2_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   source_katz              float32
 1   source_rank              float32
 2   destination_rank         float32
 3   source_random_walk       float32
 4   destination_random_walk  float32
 5   racn                     float32
 6   destination_katz         float32
dtypes: float32(7)
memory usage: 448.6 MB
None


In [20]:
print("The Shape of x_train_2 before reformattring =", x_train_2.shape)
print("The Shape of x_train_2 after reformattring = ", x_train_2_new.shape)

The Shape of x_train_2 before reformattring = (16798783, 7)
The Shape of x_train_2 after reformattring =  (16798783, 7)


In [21]:
x_train_2_new.to_csv("Data/x_train_2.csv", index = False)

In [22]:
x_train_3 = pd.read_csv("Data/x_train_3.csv")

In [23]:
print(x_train_3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   shortest_path_length   int64  
 1   same_community         int64  
 2   mutual_following       int64  
 3   source_weight_out      float64
 4   destination_weight_in  float64
 5   weight_multiplication  float64
 6   weight_addition        float64
dtypes: float64(4), int64(3)
memory usage: 897.2 MB
None


In [24]:
x_train_3_new = x_train_3.copy()

In [25]:
x_train_3_new = reduce_mem_usage(x_train_3_new)

Memory usage of dataframe is 897.15 MB


100%|██████████| 7/7 [00:01<00:00,  5.71it/s]



Memory usage after optimization is: 304.391 MB
Decreased by 66.1%


In [26]:
for i in x_train_3.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_train_3[i] - x_train_3_new[i])))

diffrence in term of shortest_path_length = 0
diffrence in term of same_community = 0
diffrence in term of mutual_following = 0
diffrence in term of source_weight_out = 0.04939325598161893
diffrence in term of destination_weight_in = 0.057083435447945134
diffrence in term of weight_multiplication = 0.0019168762941958058
diffrence in term of weight_addition = 0.05208996932512128


In [27]:
print(x_train_3_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16798783 entries, 0 to 16798782
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   shortest_path_length   int8   
 1   same_community         int8   
 2   mutual_following       int8   
 3   source_weight_out      float32
 4   destination_weight_in  float32
 5   weight_multiplication  float32
 6   weight_addition        float32
dtypes: float32(4), int8(3)
memory usage: 304.4 MB
None


In [28]:
print("The Shape of x_train_2 before reformattring =", x_train_3.shape)
print("The Shape of x_train_2 after reformattring = ", x_train_3_new.shape)

The Shape of x_train_2 before reformattring = (16798783, 7)
The Shape of x_train_2 after reformattring =  (16798783, 7)


In [29]:
x_train_3_new.to_csv("Data/x_train_3.csv", index = False)

### Reformate Test Data

In [18]:
x_test_1 = pd.read_csv("Data/x_test_1.csv")

In [19]:
x_test_1.drop(["source_node", "destination_node", "label"], axis = 1, inplace = True)
x_test_1.drop(["source_outdegree", "destination_indegree"], axis = 1, inplace = True)

In [20]:
x_test_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 23 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   salton_for_outcoming               1567572 non-null  float64
 1   salton_for_incoming                1567572 non-null  float64
 2   source_indegree                    1567572 non-null  int64  
 3   destination_outdegree              1567572 non-null  int64  
 4   outcoming_intersection             1567572 non-null  int64  
 5   incoming_intersection              1567572 non-null  int64  
 6   jaccard_for_outcoming              1567572 non-null  float64
 7   jaccard_for_incoming               1567572 non-null  float64
 8   sorensen_for_outcoming             1567572 non-null  float64
 9   sorensen_for_incoming              1567572 non-null  float64
 10  outcoming_preferential_attachment  1567572 non-null  int64  
 11  incoming_preferential_at

In [21]:
x_test_1_new = x_test_1.copy()

In [22]:
cols = ["source_indegree", "destination_outdegree", \
        "outcoming_intersection", "incoming_intersection"]
x_test_1_new = reduce_mem_usage(x_test_1_new, subset = cols)

Memory usage of dataframe is 275.07 MB


100%|██████████| 23/23 [00:01<00:00, 11.79it/s]



Memory usage after optimization is: 125.576 MB
Decreased by 54.3%


In [23]:
for i in x_test_1_new.columns:
  print("diffrence in term of", i, "=", np.sum(np.abs(x_test_1[i] - x_test_1_new[i])))

diffrence in term of salton_for_outcoming = 0.0025163076529407455
diffrence in term of salton_for_incoming = 0.0026998399453208284
diffrence in term of source_indegree = 0
diffrence in term of destination_outdegree = 0
diffrence in term of outcoming_intersection = 0
diffrence in term of incoming_intersection = 0
diffrence in term of jaccard_for_outcoming = 0.0016133665586850604
diffrence in term of jaccard_for_incoming = 0.0017152857774151205
diffrence in term of sorensen_for_outcoming = 0.0012320133053812527
diffrence in term of sorensen_for_incoming = 0.0013169595609730067
diffrence in term of outcoming_preferential_attachment = 0
diffrence in term of incoming_preferential_attachment = 0
diffrence in term of adamic_adar = 0.04985121611439958
diffrence in term of outcoming_hup_promoted = 0.002623304883788916
diffrence in term of incoming_hup_promoted = 0.0027849719021239734
diffrence in term of outcoming_hup_depressed = 0.0019115864635968246
diffrence in term of incoming_hup_depressed

In [24]:
print(x_test_1_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 23 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   salton_for_outcoming               1567572 non-null  float32
 1   salton_for_incoming                1567572 non-null  float32
 2   source_indegree                    1567572 non-null  uint16 
 3   destination_outdegree              1567572 non-null  uint16 
 4   outcoming_intersection             1567572 non-null  uint16 
 5   incoming_intersection              1567572 non-null  uint16 
 6   jaccard_for_outcoming              1567572 non-null  float32
 7   jaccard_for_incoming               1567572 non-null  float32
 8   sorensen_for_outcoming             1567572 non-null  float32
 9   sorensen_for_incoming              1567572 non-null  float32
 10  outcoming_preferential_attachment  1567572 non-null  int32  
 11  incoming_preferential_at

In [25]:
x_test_1_new.to_csv("Data/x_test_1.csv", index = False)

In [26]:
print("The Shape of x_test_1 before before reformattring =", x_test_1.shape)
print("The Shape of x_test_1 after reformattring = ", x_test_1_new.shape)

The Shape of x_test_1 before before reformattring = (1567572, 23)
The Shape of x_test_1 after reformattring =  (1567572, 23)


In [27]:
x_test_2 = pd.read_csv("Data/x_test_2.csv")

In [28]:
print(x_test_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   source_katz              1567572 non-null  float64
 1   source_rank              1567572 non-null  float64
 2   destination_rank         1567572 non-null  float64
 3   source_random_walk       1567572 non-null  float64
 4   destination_random_walk  1567572 non-null  float64
 5   racn                     1567572 non-null  float64
 6   destination_katz         1567572 non-null  float64
dtypes: float64(7)
memory usage: 83.7 MB
None


In [29]:
x_test_2_new = x_test_2.copy()

In [30]:
x_test_2_new = reduce_mem_usage(x_test_2_new)

Memory usage of dataframe is 83.72 MB


100%|██████████| 7/7 [00:00<00:00, 26.36it/s]



Memory usage after optimization is: 41.859 MB
Decreased by 50.0%


In [31]:
for i in x_test_2.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_test_2[i] - x_test_2_new[i])))

diffrence in term of source_katz = 5.486643307455342e-07
diffrence in term of source_rank = 4.683450395564538e-05
diffrence in term of destination_rank = 4.95783926662708e-05
diffrence in term of source_random_walk = 0.00022982041179776813
diffrence in term of destination_random_walk = 0.00023456575222811915
diffrence in term of racn = 8.688801447223228e-05
diffrence in term of destination_katz = 6.311242697911411e-07


In [32]:
print(x_test_2_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   source_katz              1567572 non-null  float32
 1   source_rank              1567572 non-null  float32
 2   destination_rank         1567572 non-null  float32
 3   source_random_walk       1567572 non-null  float32
 4   destination_random_walk  1567572 non-null  float32
 5   racn                     1567572 non-null  float32
 6   destination_katz         1567572 non-null  float32
dtypes: float32(7)
memory usage: 41.9 MB
None


In [33]:
x_test_2_new.to_csv("Data/x_test_2.csv", index = False)

In [34]:
print("The Shape of x_test_2 before reformattring =", x_test_2.shape)
print("The Shape of x_test_2 after reformattring = ", x_test_2_new.shape)

The Shape of x_test_2 before reformattring = (1567572, 7)
The Shape of x_test_2 after reformattring =  (1567572, 7)


In [35]:
x_test_3 = pd.read_csv("Data/x_test_3.csv")

In [36]:
x_test_3.drop(["source_node", "destination_node", "label"], axis = 1, inplace = True)

In [37]:
print(x_test_3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 7 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   shortest_path_length   1567572 non-null  int64  
 1   same_community         1567572 non-null  int64  
 2   mutual_following       1567572 non-null  int64  
 3   source_weight_out      1567572 non-null  float64
 4   destination_weight_in  1567572 non-null  float64
 5   weight_multiplication  1567572 non-null  float64
 6   weight_addition        1567572 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 83.7 MB
None


In [38]:
x_test_3_new = x_test_3.copy()

In [39]:
x_test_3_new = reduce_mem_usage(x_test_3_new)

Memory usage of dataframe is 83.72 MB


100%|██████████| 7/7 [00:00<00:00, 53.64it/s]



Memory usage after optimization is: 28.404 MB
Decreased by 66.1%


In [40]:
for i in x_test_3.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_test_3[i] - x_test_3_new[i])))

diffrence in term of shortest_path_length = 0
diffrence in term of same_community = 0
diffrence in term of mutual_following = 0
diffrence in term of source_weight_out = 0.004449374613625184
diffrence in term of destination_weight_in = 0.005392772407080665
diffrence in term of weight_multiplication = 0.000370295045064194
diffrence in term of weight_addition = 0.005515399447819785


In [41]:
print(x_test_3_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567572 entries, 0 to 1567571
Data columns (total 7 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   shortest_path_length   1567572 non-null  int8   
 1   same_community         1567572 non-null  int8   
 2   mutual_following       1567572 non-null  int8   
 3   source_weight_out      1567572 non-null  float32
 4   destination_weight_in  1567572 non-null  float32
 5   weight_multiplication  1567572 non-null  float32
 6   weight_addition        1567572 non-null  float32
dtypes: float32(4), int8(3)
memory usage: 28.4 MB
None


In [42]:
print("The Shape of x_test_3 before reformattring =", x_test_3_new.shape)
print("The Shape of x_test_3 after reformattring = ", x_test_3_new.shape)

The Shape of x_test_3 before reformattring = (1567572, 7)
The Shape of x_test_3 after reformattring =  (1567572, 7)


In [43]:
x_test_3_new.to_csv("Data/x_test_3.csv", index = False)

### Reformate Validation Data

In [44]:
x_valid_1 = pd.read_csv("Data/x_valid_1.csv")

In [45]:
x_valid_1.drop(["source_node", "destination_node", "label"], axis = 1, inplace = True)
x_valid_1.drop(["source_outdegree", "destination_indegree"], axis = 1, inplace = True)

In [46]:
print(x_valid_1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   salton_for_outcoming               508683 non-null  float64
 1   salton_for_incoming                508683 non-null  float64
 2   source_indegree                    508683 non-null  int64  
 3   destination_outdegree              508683 non-null  int64  
 4   outcoming_intersection             508683 non-null  int64  
 5   incoming_intersection              508683 non-null  int64  
 6   jaccard_for_outcoming              508683 non-null  float64
 7   jaccard_for_incoming               508683 non-null  float64
 8   sorensen_for_outcoming             508683 non-null  float64
 9   sorensen_for_incoming              508683 non-null  float64
 10  outcoming_preferential_attachment  508683 non-null  int64  
 11  incoming_preferential_attachment   5086

In [47]:
x_valid_1_new = x_valid_1.copy()

In [49]:
cols = ["source_indegree", "destination_outdegree", \
        "outcoming_intersection", "incoming_intersection"]
x_valid_1_new = reduce_mem_usage(x_valid_1_new, subset = cols)

Memory usage of dataframe is 87.32 MB


100%|██████████| 23/23 [00:00<00:00, 28.77it/s]



Memory usage after optimization is: 40.750 MB
Decreased by 53.3%


In [50]:
for i in x_valid_1.columns:
  print("diffrence in term of", i, "=", np.sum(np.abs(x_valid_1[i] - x_valid_1_new[i])))

diffrence in term of salton_for_outcoming = 0.000816529218713234
diffrence in term of salton_for_incoming = 0.0008727327938192805
diffrence in term of source_indegree = 0
diffrence in term of destination_outdegree = 0
diffrence in term of outcoming_intersection = 0
diffrence in term of incoming_intersection = 0
diffrence in term of jaccard_for_outcoming = 0.0005214801049962576
diffrence in term of jaccard_for_incoming = 0.0005549221308467998
diffrence in term of sorensen_for_outcoming = 0.00040160069298310276
diffrence in term of sorensen_for_incoming = 0.0004261367912969569
diffrence in term of outcoming_preferential_attachment = 0
diffrence in term of incoming_preferential_attachment = 0
diffrence in term of adamic_adar = 0.016215055728711292
diffrence in term of outcoming_hup_promoted = 0.0008504966525199392
diffrence in term of incoming_hup_promoted = 0.0009042828090734777
diffrence in term of outcoming_hup_depressed = 0.0006187486923129541
diffrence in term of incoming_hup_depress

In [51]:
print(x_valid_1_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   salton_for_outcoming               508683 non-null  float32
 1   salton_for_incoming                508683 non-null  float32
 2   source_indegree                    508683 non-null  uint16 
 3   destination_outdegree              508683 non-null  uint16 
 4   outcoming_intersection             508683 non-null  uint16 
 5   incoming_intersection              508683 non-null  uint16 
 6   jaccard_for_outcoming              508683 non-null  float32
 7   jaccard_for_incoming               508683 non-null  float32
 8   sorensen_for_outcoming             508683 non-null  float32
 9   sorensen_for_incoming              508683 non-null  float32
 10  outcoming_preferential_attachment  508683 non-null  int32  
 11  incoming_preferential_attachment   5086

In [52]:
print("The Shape of x_valid_1 before reformattring =", x_valid_1.shape)
print("The Shape of x_valid_1 after reformattring = ", x_valid_1_new.shape)

The Shape of x_valid_1 before reformattring = (508683, 23)
The Shape of x_valid_1 after reformattring =  (508683, 23)


In [53]:
x_valid_1_new.to_csv("Data/x_valid_1.csv", index = False)

In [54]:
x_valid_2 = pd.read_csv("Data/x_valid_2.csv")

In [55]:
print(x_valid_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   source_katz              508683 non-null  float64
 1   source_rank              508683 non-null  float64
 2   destination_rank         508683 non-null  float64
 3   source_random_walk       508683 non-null  float64
 4   destination_random_walk  508683 non-null  float64
 5   racn                     508683 non-null  float64
 6   destination_katz         508683 non-null  float64
dtypes: float64(7)
memory usage: 27.2 MB
None


In [56]:
x_valid_2_new = x_valid_2.copy()

In [57]:
x_valid_2_new = reduce_mem_usage(x_valid_2_new)

Memory usage of dataframe is 27.17 MB


100%|██████████| 7/7 [00:00<00:00, 100.75it/s]


Memory usage after optimization is: 13.583 MB
Decreased by 50.0%





In [58]:
for i in x_valid_2.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_valid_2[i] - x_valid_2_new[i])))

diffrence in term of source_katz = 1.8245570875831917e-07
diffrence in term of source_rank = 1.4568408979905023e-05
diffrence in term of destination_rank = 1.407679558804685e-05
diffrence in term of source_random_walk = 9.280834028227436e-05
diffrence in term of destination_random_walk = 8.49410066154771e-05
diffrence in term of racn = 3.316205588670978e-05
diffrence in term of destination_katz = 1.8609508967864542e-07


In [59]:
print(x_valid_2_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   source_katz              508683 non-null  float32
 1   source_rank              508683 non-null  float32
 2   destination_rank         508683 non-null  float32
 3   source_random_walk       508683 non-null  float32
 4   destination_random_walk  508683 non-null  float32
 5   racn                     508683 non-null  float32
 6   destination_katz         508683 non-null  float32
dtypes: float32(7)
memory usage: 13.6 MB
None


In [60]:
print("The Shape of x_valid_2 before reformattring =", x_valid_2.shape)
print("The Shape of x_valid_2 after reformattring = ", x_valid_2_new.shape)

The Shape of x_valid_2 before reformattring = (508683, 7)
The Shape of x_valid_2 after reformattring =  (508683, 7)


In [61]:
x_valid_2_new.to_csv("Data/x_valid_2.csv", index = False)

In [62]:
x_valid_3 = pd.read_csv("Data/x_valid_3.csv")

In [63]:
x_valid_3.drop(["source_node", "destination_node", "label"], axis = 1, inplace = True)

In [64]:
print(x_valid_3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   shortest_path_length   508683 non-null  int64  
 1   same_community         508683 non-null  int64  
 2   mutual_following       508683 non-null  int64  
 3   source_weight_out      508683 non-null  float64
 4   destination_weight_in  508683 non-null  float64
 5   weight_multiplication  508683 non-null  float64
 6   weight_addition        508683 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 27.2 MB
None


In [65]:
x_valid_3_new = x_valid_3.copy()

In [66]:
x_valid_3_new = reduce_mem_usage(x_valid_3_new)

Memory usage of dataframe is 27.17 MB


100%|██████████| 7/7 [00:00<00:00, 165.90it/s]


Memory usage after optimization is: 9.217 MB
Decreased by 66.1%





In [67]:
for i in x_valid_3.columns:
  print("diffrence in term of", i, "=", np.abs(np.sum(x_valid_3[i] - x_valid_3_new[i])))

diffrence in term of shortest_path_length = 0
diffrence in term of same_community = 0
diffrence in term of mutual_following = 0
diffrence in term of source_weight_out = 0.001441808280225533
diffrence in term of destination_weight_in = 0.0017491394747898847
diffrence in term of weight_multiplication = 0.00011405158321948659
diffrence in term of weight_addition = 0.0017951595814931776


In [68]:
print(x_valid_3_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508683 entries, 0 to 508682
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   shortest_path_length   508683 non-null  int8   
 1   same_community         508683 non-null  int8   
 2   mutual_following       508683 non-null  int8   
 3   source_weight_out      508683 non-null  float32
 4   destination_weight_in  508683 non-null  float32
 5   weight_multiplication  508683 non-null  float32
 6   weight_addition        508683 non-null  float32
dtypes: float32(4), int8(3)
memory usage: 9.2 MB
None


In [69]:
print("The Shape of x_valid_3 before  reformattring =", x_valid_3.shape)
print("The Shape of x_valid_3 after reformattring = ", x_valid_3_new.shape)

The Shape of x_valid_3 before  reformattring = (508683, 7)
The Shape of x_valid_3 after reformattring =  (508683, 7)


In [70]:
x_valid_3_new.to_csv("Data/x_valid_3.csv", index = False)