In [69]:
# import the neccessary dependacies we will use by default
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder

Some insights into this project:
- The category of machine learning models that these fall into is _supervised learning_. Supervised learning is a type of machine learning where the model is trained on labeled data. The label in this case is the method of data breach.

The models that will be tested out will fall into the following category of machine learning:
* <u><b>Logistic Regression</b></u>: This is a supervised Learning that can be used to predict a categorical outcome. In this case, the categorical outcome is the method of the data breach. The features that will be used are: entity, year, records and organization type.
* <u><b>Decision Trees</b></u>: This is another supervised learning algorithm that can be used to predict a categorical outcome. Decision trees work by creating a tree-like structure that represents the relationships between the features and the outcomes.
* <u><b>Support Vector Machine(SVMs):</u></b> This is a supervised learning algorithm that can be used to predict both categorical and continous outcomes. SVMs work by finding the hyperplane that best seperates the data points into different classes.
* <u><b>Random Forests</u></b> This is an ensemble learning algorithm that combines multiple decision trees to improve the accuracy of the predictions.
* <u><b>Neural Networks</u></b> This is a more complex algorithm that can be used to predict both categorical and continuous outcomes. Neural Networks work by learning the relationships between the features and the outcome through a process called backpropogation.

In [70]:
# load the dataset(s) we will be using
csv_file_path = os.path.abspath('df_1.csv');  # get the absolute path of the CS

df = pd.read_csv(csv_file_path);   # Read the CSV file into a datafram
# display the head to see if the dataset works as intended
df.head(10)  #adjust the parameter value as needed

Unnamed: 0.1,Unnamed: 0,Entity,Year,Records,Organization type,Method,Sources
0,0,21st Century Oncology,2016,2200000,healthcare,hacked,[5][6]
1,1,500px,2020,14870304,social networking,hacked,[7]
2,2,Accendo Insurance Co.,2020,175350,healthcare,poor security,[8][9]
3,3,Adobe Systems Incorporated,2013,152000000,tech,hacked,[10]
4,4,Adobe Inc.,2019,7500000,tech,poor security,[11][12]
5,5,Advocate Medical Group,2017,4000000,healthcare,lost / stolen media,[13][14]
6,6,AerServ (subsidiary of InMobi),2018,75000,advertising,hacked,[15]
7,7,"Affinity Health Plan, Inc.",2013,344579,healthcare,lost / stolen media,[16][17]
8,8,Airtel,2019,320000000,telecommunications,poor security,[18]
9,9,Air Canada,2018,20000,transport,hacked,[19]


From my exploration, I discovered that dataframe 3 and dataframe 1 are the same, so merging them would be useless, dataframe 2 is different from dataframe 1, therefore, merging them holds some value.

In [71]:
df.dtypes  # simply lists out the datatype we are working with

Unnamed: 0            int64
Entity               object
Year                 object
Records              object
Organization type    object
Method               object
Sources              object
dtype: object

In [72]:
df.columns # observe the column list

Index(['Unnamed: 0', 'Entity', 'Year', 'Records', 'Organization type',
       'Method', 'Sources'],
      dtype='object')

In [73]:
# we will need to rename the unnamed column
df.rename(columns={'Unnamed: 0' : 'Index'}, inplace=True)

In [74]:
# Test out the number of unique values contained in Entity
len(df['Entity'].unique())   #there's a total of 331 unique name for comapnies

331

Regarding the column __Entity__, there's 2 options in terms of data preprocessing:
* Drop the column altogether
* Apply Hashing

In [75]:
import hashlib

#create a hash function
def hash_function(text):
    return hashlib.sha256(text.encode()).hexdigest()

# create a new column for the hashed values
df['hashed_Entity'] = df['Entity'].apply(hash_function)

#observe the hashed column
df.head()  # we have the hashed entity

Unnamed: 0,Index,Entity,Year,Records,Organization type,Method,Sources,hashed_Entity
0,0,21st Century Oncology,2016,2200000,healthcare,hacked,[5][6],c3a872f67b2c21449ed696e019d008cbb15b384ed63164...
1,1,500px,2020,14870304,social networking,hacked,[7],39b1d7f5c52ecec4511c7d93214a3dc33915fcb67c51e1...
2,2,Accendo Insurance Co.,2020,175350,healthcare,poor security,[8][9],d37b82c7c3fef3eb56a05bc88697bd64c489f8b13b9ed8...
3,3,Adobe Systems Incorporated,2013,152000000,tech,hacked,[10],4ea8e5179ec6c6866f040ebfa23a41946cb05bd37b5ebc...
4,4,Adobe Inc.,2019,7500000,tech,poor security,[11][12],4e2eb72d6ffd4a841309dcd87b646d2408ac7394956117...


In [76]:
# create a copy of the dataframe and drop the Entity column 
df_copy1 = df.drop(columns={'Entity'})

In [77]:
df_copy1  # we don't want to make modifications to the original dataframe 

Unnamed: 0,Index,Year,Records,Organization type,Method,Sources,hashed_Entity
0,0,2016,2200000,healthcare,hacked,[5][6],c3a872f67b2c21449ed696e019d008cbb15b384ed63164...
1,1,2020,14870304,social networking,hacked,[7],39b1d7f5c52ecec4511c7d93214a3dc33915fcb67c51e1...
2,2,2020,175350,healthcare,poor security,[8][9],d37b82c7c3fef3eb56a05bc88697bd64c489f8b13b9ed8...
3,3,2013,152000000,tech,hacked,[10],4ea8e5179ec6c6866f040ebfa23a41946cb05bd37b5ebc...
4,4,2019,7500000,tech,poor security,[11][12],4e2eb72d6ffd4a841309dcd87b646d2408ac7394956117...
...,...,...,...,...,...,...,...
347,347,2019,173000000,social network,hacked,[406][407],637e6be11e5f556343bad0073ff64ddcf396a88cec35d7...
348,348,2020,200000000,financial,accidentally published,[408],985ee128022c38982d1de4c4173f538ced9151f0517fe9...
349,349,2020,391250,healthcare,poor security,[409],869ada24f3639cb94ed2dd22ec15aca5928cb7b7c0bc28...
350,350,2022,6400000,various,poor security,[410] [411],89def538702db6ab06b39b1d28f5f7980de01dd04bd0a0...


In [78]:
len(df_copy1['Organization type'].unique())   # in terms of organization type, there's 70 different unique values
df_copy1

Unnamed: 0,Index,Year,Records,Organization type,Method,Sources,hashed_Entity
0,0,2016,2200000,healthcare,hacked,[5][6],c3a872f67b2c21449ed696e019d008cbb15b384ed63164...
1,1,2020,14870304,social networking,hacked,[7],39b1d7f5c52ecec4511c7d93214a3dc33915fcb67c51e1...
2,2,2020,175350,healthcare,poor security,[8][9],d37b82c7c3fef3eb56a05bc88697bd64c489f8b13b9ed8...
3,3,2013,152000000,tech,hacked,[10],4ea8e5179ec6c6866f040ebfa23a41946cb05bd37b5ebc...
4,4,2019,7500000,tech,poor security,[11][12],4e2eb72d6ffd4a841309dcd87b646d2408ac7394956117...
...,...,...,...,...,...,...,...
347,347,2019,173000000,social network,hacked,[406][407],637e6be11e5f556343bad0073ff64ddcf396a88cec35d7...
348,348,2020,200000000,financial,accidentally published,[408],985ee128022c38982d1de4c4173f538ced9151f0517fe9...
349,349,2020,391250,healthcare,poor security,[409],869ada24f3639cb94ed2dd22ec15aca5928cb7b7c0bc28...
350,350,2022,6400000,various,poor security,[410] [411],89def538702db6ab06b39b1d28f5f7980de01dd04bd0a0...


In [79]:
# remove the special characters
df_copy1['Organization type'] = df_copy1['Organization type'].str.replace(',', '_')  # replace the cases of ',' with '_'.
df_copy1['Organization type'] = df_copy1['Organization type'].str.replace(' ', '_') # we also replace the cases of spacing with '_'.
df_copy1['Organization type'] = df_copy1['Organization type'].str.replace('__', '_')  # replace all instances of __ with _

Before encoding the organization type, I've copied the df to keep the original data intact. I'm dropping the hashed_Entity column as it is not needed for visualizing – we want to focus on industries and not individual companies.

In [80]:
df_vis = df_copy1  #create another copy of the df_copy and perform further data preprocessing
#drop the hashed entity column
df_vis.drop(columns={'hashed_Entity'}, inplace=True)  #note: after dropping the column the first time, you may end up running into an error if you were to rerun the code block again
df_vis

Unnamed: 0,Index,Year,Records,Organization type,Method,Sources
0,0,2016,2200000,healthcare,hacked,[5][6]
1,1,2020,14870304,social_networking,hacked,[7]
2,2,2020,175350,healthcare,poor security,[8][9]
3,3,2013,152000000,tech,hacked,[10]
4,4,2019,7500000,tech,poor security,[11][12]
...,...,...,...,...,...,...
347,347,2019,173000000,social_network,hacked,[406][407]
348,348,2020,200000000,financial,accidentally published,[408]
349,349,2020,391250,healthcare,poor security,[409]
350,350,2022,6400000,various,poor security,[410] [411]


Checking if the Year column is well formatted (should be a year like 2016, or 2019, not 2016-2019). If not, we'll need to do some data cleaning.

In [81]:
#Check if any value in the year column is null
print("Null values: ", df_vis['Year'].isnull().values.any())  # there are no null values in the year column

#Check if any value in the year column is not well formatted (i.e. not a number)
print("Non numeric values: ", df_vis['Year'].str.isnumeric().values.any())  # there are values that are not numeric

#print out all the values in the year column that are not numeric
print("Non numeric values:")
df_vis[~df_vis['Year'].str.isnumeric()]  # as we can see, the three columns that have non-numeric values are in index 94 96 and 144

Null values:  False
Non numeric values:  True
Non numeric values:


Unnamed: 0,Index,Year,Records,Organization type,Method,Sources
94,94,2019-2020,"9,000,000 (approx) - basic booking, 2208 (cred...",transport,hacked,[119][120]
96,96,2018-2019,2000000,restaurant,hacked,[122]
144,144,2014 and 2015,363000,hotel,hacked,[194][195]


In [82]:
#having index and the dataframe indexing is redundant
df_vis.drop('Index', inplace=True, axis=1)

In [83]:
#observe the dataset to see if the colum index has been successfully dropped
df_vis  # as we can see, the column index has been successfully dropped

Unnamed: 0,Year,Records,Organization type,Method,Sources
0,2016,2200000,healthcare,hacked,[5][6]
1,2020,14870304,social_networking,hacked,[7]
2,2020,175350,healthcare,poor security,[8][9]
3,2013,152000000,tech,hacked,[10]
4,2019,7500000,tech,poor security,[11][12]
...,...,...,...,...,...
347,2019,173000000,social_network,hacked,[406][407]
348,2020,200000000,financial,accidentally published,[408]
349,2020,391250,healthcare,poor security,[409]
350,2022,6400000,various,poor security,[410] [411]


There are 3 values in the Year column that are not well formed – we also will need to fix the Records column for similar formatting issues.

In [84]:
'''
Modify df_vis:
for each value in the year column that is not numeric:
    record the last year listed in the year column (i.e. the last 4 characters)
    record the first year listed in the year column (i.e. the first 4 characters)
change the year column of this entry to the first 4 characters (the first year)
for each year between the first year and the last year:
    add a new entry to the dataframe with the same values as the entry that was changed, except for the year column, which will be the year in question
'''
last_row_index = df_vis.tail(1).index[0]
# Create an empty list to store modified rows
new_rows = []

# Iterate through the DataFrame
for index, row in df_vis.iterrows():
    year_value = row['Year']
    
    # Check if the year is not numeric
    if not year_value.isnumeric():
        # Extract the first and last year
        first_year = int(year_value[:4])
        last_year = int(year_value[-4:])

        # Change the year column to the first year
        df_vis.loc[index, 'Year'] = str(first_year)
        
        #Create new rows for each year between the first and last year
        for year in range(first_year + 1, last_year + 1):
            last_row_index+=1 # Increment the index of the last row
            new_row = row.copy()  # Create a copy of the current row
            new_row['Year'] = str(year)
            new_row['Index'] = last_row_index
            new_rows.append(new_row) # Append the new row to the list

# Concatenate the new rows with the original DataFrame
new_rows_df = pd.DataFrame(new_rows)

df_vis = pd.concat([df_vis, new_rows_df], ignore_index=True)

#essentially, rather than having multiple year, it was replaced by the first year that pops up.
df_vis

Unnamed: 0,Year,Records,Organization type,Method,Sources,Index
0,2016,2200000,healthcare,hacked,[5][6],
1,2020,14870304,social_networking,hacked,[7],
2,2020,175350,healthcare,poor security,[8][9],
3,2013,152000000,tech,hacked,[10],
4,2019,7500000,tech,poor security,[11][12],
...,...,...,...,...,...,...
350,2022,6400000,various,poor security,[410] [411],
351,2022,95000,retail,accidentally published,[412],
352,2020,"9,000,000 (approx) - basic booking, 2208 (cred...",transport,hacked,[119][120],352.0
353,2019,2000000,restaurant,hacked,[122],353.0


The Year's column is fixed, but I suspect this approach may be exaggerating the number of breaches in 2019.
Instead, I will fix the Records column, and then split (divide?) the number of records by the number of years the breach lasted.


In [85]:
df_vis['Organization type'].unique()   

array(['healthcare', 'social_networking', 'tech', 'advertising',
       'telecommunications', 'transport', 'web', 'financial', 'gaming',
       'tech_retail', 'telecoms', 'dating', 'government', 'academic',
       'retail', 'educational_services', 'gambling', 'banking', 'game',
       'energy', 'background_check', 'information_technology',
       'hosting_provider', 'military', 'health', 'consulting_accounting',
       'political', 'messaging_app', 'restaurant',
       'financial_credit_reporting', 'data_broker', 'social_network',
       'financial_service_company', 'Information_Security',
       'personal_and_demographic_data_about_residents_and_their_properties_of_US',
       'hotel', 'humanitarian', 'special_public_corporation', 'shopping',
       'local_search', 'mobile_carrier', 'publisher_(magazine)',
       'government_healthcare', 'software', 'web_military',
       'online_shopping', 'genealogy', 'media', 'telecom',
       'QR_code_payment', 'fashion', 'Clinical_Laboratory',
  

In [86]:
le = LabelEncoder()  #perform label encoding on the organization_type, the copy of df_vis

# implement label encoding on the Organization type column
le.fit(df_vis['Organization type'])   # fit the data we want to train the encoder on
df_vis['Organization type'] = le.transform(df_vis['Organization type'])
# observe how the column 'Organization Type' has changed
df_vis.head()

Unnamed: 0,Year,Records,Organization type,Method,Sources,Index
0,2016,2200000,30,hacked,[5][6],
1,2020,14870304,53,hacked,[7],
2,2020,175350,30,poor security,[8][9],
3,2013,152000000,56,hacked,[10],
4,2019,7500000,56,poor security,[11][12],


In [87]:
df_vis.dtypes   # as we can see, the Organization type changed from Object --> integer datatype

Year                  object
Records               object
Organization type      int32
Method                object
Sources               object
Index                float64
dtype: object

In [88]:
# observe the label frequency, to gain an understanding of outliers and inlier values
df_vis['Organization type'].value_counts()    # some values to note: 7 repeats 13 times, 18 repeats 38  times, 23 repeats 12 times, 25 repeats 30 times, 30 repeats 47 times, 49 repeats 27 times.

65    53
30    47
18    38
25    30
49    27
      ..
14     1
47     1
11     1
54     1
64     1
Name: Organization type, Length: 70, dtype: int64

In [89]:
df_vis['Method']  # we will also need to perform label encoding on the method section

0                      hacked
1                      hacked
2               poor security
3                      hacked
4               poor security
                ...          
350             poor security
351    accidentally published
352                    hacked
353                    hacked
354                    hacked
Name: Method, Length: 355, dtype: object

In [90]:
df_vis['Method'].unique()

array(['hacked', 'poor security', 'lost / stolen media',
       'accidentally published', 'inside job, hacked',
       'lost / stolen computer', 'unknown', 'unsecured S3 bucket',
       'inside job', nan, 'accidentally uploaded',
       'poor security / hacked', 'Poor security', 'unprotected api',
       'poor security/inside job', 'data exposed by misconfiguration',
       'zero-day vulnerabilities', 'intentionally lost',
       'misconfiguration/poor security', 'ransomware hacked',
       'rogue contractor', 'improper setting, hacked',
       'hacked/misconfiguration',
       'publicly accessible Amazon Web Services (AWS) server',
       'accidentally exposed', 'social engineering'], dtype=object)

In [91]:
""# seems like the method column contains certain NaN values and certain Unknown values, we will need to determine whether we replace such values or drop them in its entirety
df_vis['Method'].value_counts()     

hacked                                                  195
poor security                                            43
lost / stolen media                                      33
accidentally published                                   21
inside job                                               19
lost / stolen computer                                   16
unknown                                                   7
improper setting, hacked                                  2
poor security/inside job                                  2
intentionally lost                                        1
accidentally exposed                                      1
publicly accessible Amazon Web Services (AWS) server      1
hacked/misconfiguration                                   1
rogue contractor                                          1
ransomware hacked                                         1
misconfiguration/poor security                            1
unprotected api                         

In [92]:
#check which columns have null/unknoown values
null_mask = df.isnull().any(axis=1)
null_rows=df[null_mask]

print(null_rows)

     Index                               Entity  Year    Records  \
40      40                Bethesda Game Studios  2018        NaN   
69      69  Philippines Commission on Elections  2016   55000000   
83      83        Democratic National Committee  2016      19252   
192    192             Mobile TeleSystems (MTS)  2019  100000000   
337    337                            WordPress  2018        NaN   

      Organization type                          Method Sources  \
40               gaming          accidentally published    [53]   
69           government                          hacked     NaN   
83            political                             NaN   [106]   
192  telecommunications  misconfiguration/poor security     NaN   
337         web service                          hacked   [392]   

                                         hashed_Entity  
40   8506a2a312971641f5628bc44178b2c41e564141d10d4e...  
69   f7ddf80072a8eecdad46535cd4e2d9303e0bd21d124193...  
83   d3e8e2cd093c

In [93]:
#drop sources and index
df_vis.drop(columns=['Index', 'Sources'], axis=1, inplace=True)
df_vis

Unnamed: 0,Year,Records,Organization type,Method
0,2016,2200000,30,hacked
1,2020,14870304,53,hacked
2,2020,175350,30,poor security
3,2013,152000000,56,hacked
4,2019,7500000,56,poor security
...,...,...,...,...
350,2022,6400000,64,poor security
351,2022,95000,49,accidentally published
352,2020,"9,000,000 (approx) - basic booking, 2208 (cred...",63,hacked
353,2019,2000000,48,hacked


In [94]:
"""Two changes needs to be made regarding the method column
    1. Replace the "Unknown" value with the most frequently repeated word
    2. Drop any NaN values if it exists
"""
most_frequent_word = df_vis['Method'].value_counts().index[0]
#replace all occurences of "Unknown" with "Hacked"
df_vis['Method'].replace('unknown', most_frequent_word, inplace=True)
# drop any NaN values
df_vis.dropna(inplace=True)
# Print the dataframe
df_vis

Unnamed: 0,Year,Records,Organization type,Method
0,2016,2200000,30,hacked
1,2020,14870304,53,hacked
2,2020,175350,30,poor security
3,2013,152000000,56,hacked
4,2019,7500000,56,poor security
...,...,...,...,...
350,2022,6400000,64,poor security
351,2022,95000,49,accidentally published
352,2020,"9,000,000 (approx) - basic booking, 2208 (cred...",63,hacked
353,2019,2000000,48,hacked


In [95]:
# let's verify if the method column contains any nan/null values
df_vis.isnull().all()   # seems that none of the values here contain any more null values, as we can see none of the datatype has any null values now

Year                 False
Records              False
Organization type    False
Method               False
dtype: bool

In [96]:
print(list(df_vis['Method'].unique()))   # based on the list, we can see that the Method column doesn't contain unknown anymore.
list(df_vis['Method'].value_counts())   # we also gain insight into the frequency of the methods being repeated, there's 24 methods, therefore, the label encoding will range from 0-23

['hacked', 'poor security', 'lost / stolen media', 'accidentally published', 'inside job, hacked', 'lost / stolen computer', 'unsecured S3 bucket', 'inside job', 'accidentally uploaded', 'poor security / hacked', 'Poor security', 'unprotected api', 'poor security/inside job', 'data exposed by misconfiguration', 'zero-day vulnerabilities', 'intentionally lost', 'misconfiguration/poor security', 'ransomware hacked', 'rogue contractor', 'improper setting, hacked', 'hacked/misconfiguration', 'publicly accessible Amazon Web Services (AWS) server', 'accidentally exposed', 'social engineering']


[201, 43, 33, 20, 19, 16, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [97]:
# prior to implementing label encoding, we will need to clear up the string of some grammatical issues that will cause mismatched data when we train the model otherwise
# we want uniformity for the data input values
# convert all uppercase letters to lowercase
df_vis['Method'] = df_vis['Method'].str.lower()

# replace all spaces with "_"
df_vis['Method'] = df_vis['Method'].str.replace(' ', '_')

# replace all slash signs with "_"
df_vis['Method'] = df_vis['Method'].str.replace('/', '_')

# replace all __ and ___ with _
df_vis['Method'] = df_vis['Method'].str.replace('___', '_')
df_vis['Method'] = df_vis['Method'].str.replace('__', '_')

# check the updated dataframe
list(df_vis['Method'].unique())

['hacked',
 'poor_security',
 'lost_stolen_media',
 'accidentally_published',
 'inside_job,_hacked',
 'lost_stolen_computer',
 'unsecured_s3_bucket',
 'inside_job',
 'accidentally_uploaded',
 'poor_security_hacked',
 'unprotected_api',
 'poor_security_inside_job',
 'data_exposed_by_misconfiguration',
 'zero-day_vulnerabilities',
 'intentionally_lost',
 'misconfiguration_poor_security',
 'ransomware_hacked',
 'rogue_contractor',
 'improper_setting,_hacked',
 'hacked_misconfiguration',
 'publicly_accessible_amazon_web_services_(aws)_server',
 'accidentally_exposed',
 'social_engineering']

In [98]:
# now the method column is ready for label encoding preprocessing, since we already called on the label encoder previously, we can reuse it
le.fit(df_vis['Method'])  # train the label encoder on the column data we want to train
df_vis['Method'] = le.transform(df_vis['Method'])
df_vis   # observe that method has been successfully encoded by label

Unnamed: 0,Year,Records,Organization type,Method
0,2016,2200000,30,4
1,2020,14870304,53,4
2,2020,175350,30,13
3,2013,152000000,56,4
4,2019,7500000,56,13
...,...,...,...,...
350,2022,6400000,64,13
351,2022,95000,49,1
352,2020,"9,000,000 (approx) - basic booking, 2208 (cred...",63,4
353,2019,2000000,48,4


In [99]:
# check for Records values
df_vis['Records'].str.isnumeric() 

0       True
1       True
2       True
3       True
4       True
       ...  
350     True
351     True
352    False
353     True
354     True
Name: Records, Length: 352, dtype: bool

In [61]:
def g(df_vis):
    return df_vis[~df_vis['Records'].str.isnumeric()]

result = g(df_vis.copy())
print(result)
print("Number of non-numerical rows:", len(result))  #output: 46, meaning there's a total of 46 rows that contains non numerical values

     Year                                            Records  \
10   2019                                            unknown   
25   2021                                            unknown   
27   2015                                  G20 world leaders   
28   2019                                   19 years of data   
34   2012                                          63 stores   
41   2020                                            unknown   
47   2015                                  tens of thousands   
48   2019                                     over 5,000,000   
66   2020                              unknown (client list)   
76   2015                                           millions   
80   2017                                             235 GB   
82   2017                                 350 clients emails   
94   2019  9,000,000 (approx) - basic booking, 2208 (cred...   
104  2014                                            unknown   
109  2016                               

In [107]:
df_vis['Records'].value_counts() 
df_vis['Records'].replace('unknown', 'NaN', inplace=True)
pd.to_numeric(df_vis['Records'], errors='coerce')  # this will replace all the values in records into numerical values

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


In [None]:
#continue here  --> https://machinelearningmastery.com/building-a-regression-model-in-pytorch/

In [108]:
df_vis['Records'].value_counts()

NaN              21
1500000           7
1000000           6
500000            6
160000            4
                 ..
250 locations     1
92000000          1
80000000          1
720000            1
95000             1
Name: Records, Length: 254, dtype: int64

In [None]:
# let's take a look at the minimum and maximum values
print('min:', min(df_copy1['Method']))
print('max:', max(df_copy1['Method']))  # the smallest value is represented using 0 and the largest value is represented using 22

In [None]:
df_copy1['Method'].value_counts()  # judging by the values shown here, seems like 4 represents 'hacked'

In [None]:
# Use the inverse_transform method if you need to decode the method back to the original text
original_text = le.inverse_transform([4])
original_text[0]

In [None]:
df_copy1.dtypes  # reobserve the data, as we can see, the columns that are of object datatype needs to be changed

In [None]:
object_to_numeric = ['Year', 'Records', 'hashed_Entity']
df_copy1[object_to_numeric] = df_copy1[object_to_numeric].apply(pd.to_numeric, errors="ignore", axis=1)   # we have successfully converted the dataframe from object to float, this ensures it's ready to be trained using machine learning model
df_copy1.dtypes

In [None]:
df_copy1

In [None]:
df_copy1.isnull().values.any()   # the entirety of the dataframe does not contain any null values.

In [None]:
df.columns

Marks the end of the data preprcoessing.

In [None]:
# using the original tabel where the name of the Entities as well as the hashed entities are together, using that we can create a lookup table in the form of a dictionary

dictionary = {}  # create a dictionary that maps the hashed company names to the original company names

for index, row in df.iterrows():
    hashed_company_name = row["hashed_Entity"]
    original_company_name = row["Entity"]
    dictionary[hashed_company_name] = original_company_name
    

In [None]:
dictionary   # we have successfully created a dictionary that maps the hashed values to the name of the original companues

In [None]:
# add a new column to the DataFrame that maps the hashed company names to the original company names
df_copy1["original_Entity"] = df["hashed_Entity"].apply(lambda x: dictionary[x])

In [None]:
df_copy1

In [None]:
# convert all uppercase letters to lowercase
df_vis['Method'] = df_vis['Method'].str.lower()

# replace all spaces with "_"
df_vis['Method'] = df_vis['Method'].str.replace(' ', '_')

# replace all slash signs with "_"
df_vis['Method'] = df_vis['Method'].str.replace('/', '_')

# replace all __ and ___ with _
df_vis['Method'] = df_vis['Method'].str.replace('___', '_')
df_vis['Method'] = df_vis['Method'].str.replace('__', '_')

# Checking the Records column

# Use the 'pd.to_numeric' function to check if the values in 'Records' can be converted to numbers
numeric_records = pd.to_numeric(df_vis['Records'], errors='coerce')

# Create a boolean mask where 'numeric_records' is NaN (indicating non-numeric values)
non_numeric_rows = df_vis[numeric_records.isna()]

# Print the rows where 'Records' contains non-numeric values
print(non_numeric_rows)

# Calculate the mean of 'Records' in year with NaN values
mean_records_2019 = df_vis[df_vis['Year'] == '2019']['Records'].mean()

df_vis.at[94, 'Records'] = mean_records_2019

# Create a boolean mask where 'numeric_records' is NaN (indicating non-numeric values)
non_numeric_rows = df_vis[numeric_records.isna()]

# Print the rows where 'Records' contains non-numeric values
print("After replacing NaN Records:")
print(non_numeric_rows)
#df_vis.head()

In [None]:
plt.figure(figsize=(12, 6))

# Sort the data by count in descending order
order = df_vis['Organization type'].value_counts().index

sns.countplot(data=df_vis, x='Organization type', order=order)
plt.title('Count of Records by Organization Type')
plt.xlabel('Organization Type')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))

# Group the data by year and calculate the total records lost in each year
data_lost_by_year = df_vis.groupby('Year')['Records'].sum().reset_index()

# Create a line plot to visualize data loss over the years
sns.lineplot(data=data_lost_by_year, x='Year', y='Records')
plt.title('Data Loss Over the Years')
plt.xlabel('Year')
plt.ylabel('Total Records Lost')
plt.grid(True)
plt.show()