**Data Loading**


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw layoffs tracker CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Tech_Layoffs_Tracker.csv")
df.head(n=5)

Unnamed: 0,Company,Location HQ,# Laid Off,Date,%,Industry,Source,Stage,$ Raised (mm),Country,Date Added
0,Tract,"London,Non-U.S.",,4/3/2025,,Real Estate,https://sifted.eu/articles/ai-proptech-startup...,Unknown,,United Kingdom,4/6/2025
1,Automattic,SF Bay Area,281.0,4/2/2025,16%,Other,https://techcrunch.com/2025/04/02/wordpress-ma...,Series E,$986,United States,4/3/2025
2,Canva,"Sydney,Non-U.S.",10.0,4/2/2025,,Consumer,https://www.afr.com/technology/canva-shocks-em...,Unknown,$2500,Australia,4/3/2025
3,WhyHive,"Melbourne,Non-U.S.",,4/2/2025,100%,Data,https://www.startupdaily.net/topic/business/da...,Seed,,Australia,4/6/2025
4,Northvolt,"Stockholm,Non-U.S.",2800.0,3/31/2025,62%,Energy,https://sifted.eu/articles/northvolt-lays-off-...,Unknown,$13800,Sweden,4/2/2025


**Data Exploration**

In [None]:
# Preview the first five rows together with the column headers
df.head(n=5)

Unnamed: 0,Company,Location HQ,# Laid Off,Date,%,Industry,Source,Stage,$ Raised (mm),Country,Date Added
0,Tract,"London,Non-U.S.",,4/3/2025,,Real Estate,https://sifted.eu/articles/ai-proptech-startup...,Unknown,,United Kingdom,4/6/2025
1,Automattic,SF Bay Area,281.0,4/2/2025,16%,Other,https://techcrunch.com/2025/04/02/wordpress-ma...,Series E,$986,United States,4/3/2025
2,Canva,"Sydney,Non-U.S.",10.0,4/2/2025,,Consumer,https://www.afr.com/technology/canva-shocks-em...,Unknown,$2500,Australia,4/3/2025
3,WhyHive,"Melbourne,Non-U.S.",,4/2/2025,100%,Data,https://www.startupdaily.net/topic/business/da...,Seed,,Australia,4/6/2025
4,Northvolt,"Stockholm,Non-U.S.",2800.0,3/31/2025,62%,Energy,https://sifted.eu/articles/northvolt-lays-off-...,Unknown,$13800,Sweden,4/2/2025


In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4026 entries, 0 to 4025
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company        4026 non-null   object 
 1   Location HQ    4025 non-null   object 
 2   # Laid Off     2625 non-null   float64
 3   Date           4026 non-null   object 
 4   %              2564 non-null   object 
 5   Industry       4025 non-null   object 
 6   Source         4023 non-null   object 
 7   Stage          4019 non-null   object 
 8   $ Raised (mm)  3584 non-null   object 
 9   Country        4026 non-null   object 
 10  Date Added     4026 non-null   object 
dtypes: float64(1), object(10)
memory usage: 346.1+ KB


This shows the column names, non-null value counts and the data types of the column values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,# Laid Off
count,2625.0
mean,269.159619
std,862.618777
min,3.0
25%,40.0
50%,84.0
75%,200.0
max,15000.0


This shows the basic summary statistics of the numerical columns of the dataset.

In [None]:
# List the column names as a pandas Index
df.columns

Index(['Company', 'Location HQ', '# Laid Off', 'Date', '%', 'Industry',
       'Source', 'Stage', '$ Raised (mm)', 'Country', 'Date Added'],
      dtype='object')

This shows the column names of the dataset.

In [None]:
# Count the distinct non-null values in each column
df.nunique()

Unnamed: 0,0
Company,2777
Location HQ,264
# Laid Off,363
Date,1009
%,73
Industry,30
Source,3781
Stage,16
$ Raised (mm),713
Country,66


This shows the number of unique values in each column of the dataset.

**Data Cleaning**


In [None]:
# Count the missing (NaN/None) values in each column
df.isna().sum()

Unnamed: 0,0
Company,0
Location HQ,1
# Laid Off,1401
Date,0
%,1462
Industry,1
Source,3
Stage,7
$ Raised (mm),442
Country,0


This shows that the `# Laid Off`, `%` and `$ Raised (mm)` columns have the most missing values (1401, 1462 and 442 respectively).

In [None]:
def _strip_to_numeric(column, symbol):
    """Remove `symbol`, thousands separators and padding from `column`, then parse it as float.

    Values that still cannot be parsed (including missing ones) become NaN.
    """
    cleaned = (
        column.astype(str)
        .str.replace(symbol, '', regex=False)
        .str.replace(',', '', regex=False)  # "1,000" would otherwise be coerced to NaN
        .str.strip()
    )
    return pd.to_numeric(cleaned, errors='coerce')


def _fill_by_group_median(frame, target, key):
    """Return `frame[target]` with NaNs replaced by the median of each row's `key` group.

    Groups whose values are all NaN stay NaN (their median is NaN).
    """
    return frame.groupby(key)[target].transform(
        lambda values: values.fillna(values.median())
    )


def Handle_Missing_Values(df):
    """
    Clean missing values and convert key columns of the layoffs dataset.

    Steps, in order:
      1. Drop rows with a missing ``Company`` or ``Date``.
      2. Fill categorical gaps: ``Location HQ`` with its most frequent value
         ("Unknown" if the column has no values at all), ``Industry`` and
         ``Stage`` with "Unknown", ``Source`` with "Unavailable".
      3. Parse ``%`` and ``$ Raised (mm)`` from strings such as "16%" / "$986"
         into floats.
      4. Impute numeric gaps with group medians, falling back to the overall
         column median:
           - ``# Laid Off``    : by Industry, then overall
           - ``%``             : by Industry, then by Stage, then overall
           - ``$ Raised (mm)`` : by Stage, then overall

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is not modified.
    """
    # Work on an explicit copy so the column assignments below never target a
    # filtered view of the caller's frame (avoids SettingWithCopyWarning).
    df = df.dropna(subset=["Company", "Date"]).copy()

    # Fill categorical columns
    hq_modes = df["Location HQ"].mode()
    # mode() is empty when every value is missing; indexing [0] would raise then
    hq_fill = hq_modes.iloc[0] if not hq_modes.empty else "Unknown"
    df["Location HQ"] = df["Location HQ"].fillna(hq_fill)
    df["Industry"] = df["Industry"].fillna("Unknown")
    df["Source"] = df["Source"].fillna("Unavailable")
    df["Stage"] = df["Stage"].fillna("Unknown")

    # Convert the string-typed numeric columns to float
    df['%'] = _strip_to_numeric(df['%'], '%')
    df['$ Raised (mm)'] = _strip_to_numeric(df['$ Raised (mm)'], '$')

    # Impute numeric columns: most specific grouping first, overall median last
    df['# Laid Off'] = _fill_by_group_median(df, '# Laid Off', 'Industry')
    df['# Laid Off'] = df['# Laid Off'].fillna(df['# Laid Off'].median())

    df['%'] = _fill_by_group_median(df, '%', 'Industry')
    df['%'] = _fill_by_group_median(df, '%', 'Stage')
    df['%'] = df['%'].fillna(df['%'].median())

    df['$ Raised (mm)'] = _fill_by_group_median(df, '$ Raised (mm)', 'Stage')
    df['$ Raised (mm)'] = df['$ Raised (mm)'].fillna(df['$ Raised (mm)'].median())
    return df

# Apply the cleaning routine; `df` is rebound to the frame the function returns
df = Handle_Missing_Values(df)

In [None]:
# Re-count missing values per column after the cleaning step
df.isna().sum()

Unnamed: 0,0
Company,0
Location HQ,0
# Laid Off,0
Date,0
%,0
Industry,0
Source,0
Stage,0
$ Raised (mm),0
Country,0


Missing Values handled successfully.


**Duplicate Values**

In [None]:
# True if at least one row is an exact repeat of an earlier row (all columns equal)
df.duplicated().any()

np.False_

As the result is False, there are no duplicate rows in the dataset.

In [None]:
# List every row that exactly repeats an earlier row (empty frame when there are none)
df.loc[df.duplicated()]

Unnamed: 0,Company,Location HQ,# Laid Off,Date,%,Industry,Source,Stage,$ Raised (mm),Country,Date Added


It confirms that there is no duplicate row in the dataframe.

In [None]:
# Parse the date columns from month/day/year strings (e.g. "3/31/2025") into datetime64.
# The result is a datetime dtype, not a re-formatted string. The explicit format removes
# any day/month ambiguity; values that do not match it become NaT.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y', errors='coerce')
df['Date Added'] = pd.to_datetime(df['Date Added'], format='%m/%d/%Y', errors='coerce')

In [None]:
# Rename the layoff count, percentage, funding and date columns to descriptive names.
# No numeric re-parsing is needed here: Handle_Missing_Values already stripped the
# '%' / '$' symbols and converted both columns to float before this cell runs.
df = df.rename(columns={
    '# Laid Off': 'Number_of_Layoff',
    '%': 'Layoff_Percentage',
    '$ Raised (mm)': 'Raised_Amount($mm)',
    'Date': 'Layoff_Date',
})
df.head()

Unnamed: 0,Company,Location HQ,Number_of_Layoff,Layoff_Date,Layoff_Percentage,Industry,Source,Stage,Raised_Amount($mm),Country,Date Added
0,Tract,"London,Non-U.S.",100.0,2025-04-03,22.0,Real Estate,https://sifted.eu/articles/ai-proptech-startup...,Unknown,180.0,United Kingdom,2025-04-06
1,Automattic,SF Bay Area,281.0,2025-04-02,16.0,Other,https://techcrunch.com/2025/04/02/wordpress-ma...,Series E,986.0,United States,2025-04-03
2,Canva,"Sydney,Non-U.S.",10.0,2025-04-02,17.0,Consumer,https://www.afr.com/technology/canva-shocks-em...,Unknown,2500.0,Australia,2025-04-03
3,WhyHive,"Melbourne,Non-U.S.",79.0,2025-04-02,100.0,Data,https://www.startupdaily.net/topic/business/da...,Seed,4.0,Australia,2025-04-06
4,Northvolt,"Stockholm,Non-U.S.",2800.0,2025-03-31,62.0,Energy,https://sifted.eu/articles/northvolt-lays-off-...,Unknown,13800.0,Sweden,2025-04-02


In [None]:
# Remove the "Non-U.S." label (with an optional leading comma/space) from Location HQ.
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped dot
# instead of being an invalid Python string escape.
df['Location HQ'] = df['Location HQ'].str.replace(r',? ?Non-U\.S\.', '', regex=True).str.strip()
df.head()

Unnamed: 0,Company,Location HQ,Number_of_Layoff,Layoff_Date,Layoff_Percentage,Industry,Source,Stage,Raised_Amount($mm),Country,Date Added
0,Tract,London,100.0,2025-04-03,22.0,Real Estate,https://sifted.eu/articles/ai-proptech-startup...,Unknown,180.0,United Kingdom,2025-04-06
1,Automattic,SF Bay Area,281.0,2025-04-02,16.0,Other,https://techcrunch.com/2025/04/02/wordpress-ma...,Series E,986.0,United States,2025-04-03
2,Canva,Sydney,10.0,2025-04-02,17.0,Consumer,https://www.afr.com/technology/canva-shocks-em...,Unknown,2500.0,Australia,2025-04-03
3,WhyHive,Melbourne,79.0,2025-04-02,100.0,Data,https://www.startupdaily.net/topic/business/da...,Seed,4.0,Australia,2025-04-06
4,Northvolt,Stockholm,2800.0,2025-03-31,62.0,Energy,https://sifted.eu/articles/northvolt-lays-off-...,Unknown,13800.0,Sweden,2025-04-02



**Feature Engineering**

In [None]:
# Derive month, year and quarter columns from the layoff date.
# Layoff_Date is already datetime64: the 'Date' column was parsed with pd.to_datetime
# before it was renamed, so it is not re-parsed here.

# Month as a number 1-12 (e.g. 4 for April), not the month name
df['Layoff_Month'] = df['Layoff_Date'].dt.month

# Calendar year (e.g. 2025)
df['Layoff_Year'] = df['Layoff_Date'].dt.year

# Calendar quarter 1-4
df['Layoff_Quarter'] = df['Layoff_Date'].dt.quarter
df.head()

Unnamed: 0,Company,Location HQ,Number_of_Layoff,Layoff_Date,Layoff_Percentage,Industry,Source,Stage,Raised_Amount($mm),Country,Date Added,Layoff_Month,Layoff_Year,Layoff_Quarter
0,Tract,London,100.0,2025-04-03,22.0,Real Estate,https://sifted.eu/articles/ai-proptech-startup...,Unknown,180.0,United Kingdom,2025-04-06,4,2025,2
1,Automattic,SF Bay Area,281.0,2025-04-02,16.0,Other,https://techcrunch.com/2025/04/02/wordpress-ma...,Series E,986.0,United States,2025-04-03,4,2025,2
2,Canva,Sydney,10.0,2025-04-02,17.0,Consumer,https://www.afr.com/technology/canva-shocks-em...,Unknown,2500.0,Australia,2025-04-03,4,2025,2
3,WhyHive,Melbourne,79.0,2025-04-02,100.0,Data,https://www.startupdaily.net/topic/business/da...,Seed,4.0,Australia,2025-04-06,4,2025,2
4,Northvolt,Stockholm,2800.0,2025-03-31,62.0,Energy,https://sifted.eu/articles/northvolt-lays-off-...,Unknown,13800.0,Sweden,2025-04-02,3,2025,1


In [None]:
# Drop the Source and Date Added columns, which are treated as redundant from here on
df = df.drop(columns=['Source', 'Date Added'])

In [None]:
# Add derived categorical columns.
# pd.cut bins are right-closed by default, so each boundary value belongs to the lower bin.
# The last edge is np.inf rather than the column maximum, so the bins stay strictly
# increasing even when the maximum is at or below the previous edge.

# Layoff_Severity: Low = (0, 50], Medium = (50, 500], High = above 500 layoffs.
# Values <= 0 fall outside every bin and become NaN.
df['Layoff_Severity'] = pd.cut(df['Number_of_Layoff'],
                               bins=[0, 50, 500, np.inf],
                               labels=['Low', 'Medium', 'High'])

# Funding_Category from Raised_Amount($mm):
# low_funded = (-1, 50], Medium_funded = (50, 250], high_funded = above 250.
# The -1 lower edge keeps companies with 0 raised in the lowest bin.
df['Funding_Category'] = pd.cut(df['Raised_Amount($mm)'],
                                bins=[-1, 50, 250, np.inf],
                                labels=['low_funded', 'Medium_funded', 'high_funded'])

In [None]:
df.duplicated().any() # re-check for exact duplicate rows now that columns were added and dropped

np.True_

In [None]:
# Collect the rows that exactly repeat an earlier row and report how many there are
duplicate_rows = df.loc[df.duplicated()]
print(f"Total duplicate rows: {len(duplicate_rows)}")

Total duplicate rows: 2


In [None]:
# Show (up to five of) the rows flagged as duplicates
df.loc[df.duplicated()].head()

Unnamed: 0,Company,Location HQ,Number_of_Layoff,Layoff_Date,Layoff_Percentage,Industry,Stage,Raised_Amount($mm),Country,Layoff_Month,Layoff_Year,Layoff_Quarter,Layoff_Severity,Funding_Category
2548,Beyond Meat,Los Angeles,200.0,2022-10-14,19.0,Food,Post-IPO,122.0,United States,10,2022,4,Medium,Medium_funded
3159,Cazoo,London,750.0,2022-06-07,15.0,Transportation,Post-IPO,2000.0,United Kingdom,6,2022,2,High,high_funded


In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# Rebuild a clean 0..n-1 index after the row removal.
# Reassigning (instead of inplace=True) yields a fresh frame, so later column assignments
# do not act on the filtered result of drop_duplicates and trigger SettingWithCopyWarning.
df = df.reset_index(drop=True)

In [None]:
df.duplicated().any() # confirm that no exact duplicate rows remain after drop_duplicates

np.False_

In [None]:
# Convert Number_of_Layoff from float to int (any remaining NaN becomes 0; fractions are truncated).
# assign() builds a new frame rather than setting a column on a possibly-filtered one,
# which avoids the SettingWithCopyWarning raised by direct column assignment.
df = df.assign(Number_of_Layoff=df['Number_of_Layoff'].fillna(0).astype(int))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Number_of_Layoff'] = df['Number_of_Layoff'].fillna(0).astype(int) # Convert number_of_layoff from float to int type


In [None]:
# Write the cleaned frame to CSV without the index column
df.to_csv('Layoffs_cleaned.csv', index=False)

In [None]:
# Final check of column names, non-null counts and dtypes of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Company             4024 non-null   object        
 1   Location HQ         4024 non-null   object        
 2   Number_of_Layoff    4024 non-null   int64         
 3   Layoff_Date         4024 non-null   datetime64[ns]
 4   Layoff_Percentage   4024 non-null   float64       
 5   Industry            4024 non-null   object        
 6   Stage               4024 non-null   object        
 7   Raised_Amount($mm)  4024 non-null   float64       
 8   Country             4024 non-null   object        
 9   Layoff_Month        4024 non-null   int32         
 10  Layoff_Year         4024 non-null   int32         
 11  Layoff_Quarter      4024 non-null   int32         
 12  Layoff_Severity     4024 non-null   category      
 13  Funding_Category    4024 non-null   category    