In [41]:
import pandas as pd

In [42]:
df = pd.read_csv('NCR_DATA.csv', encoding='latin-1')

# Discovery and Analysis

In [43]:
df.columns

Index(['Site Area', 'PTR Number', 'Type', 'Subject', 'Organisation',
       'Organisation Code', 'Project Area', 'Required Close Out Date',
       'Closed Out Date', 'Days Late', 'Date Initiated', 'Discipline',
       'Estimated_Cost_of_NCR',
       'NCR Classification,\r\n\teB_Proposed_Disposition AS [Proposed_Disposition',
       'Final_Disposition', 'Category', 'Status', 'Root Cause', 'Stage',
       'Detailed Description'],
      dtype='object')

In [44]:
# Checking missing values
df.isnull().sum()

Unnamed: 0,0
Site Area,789
PTR Number,1
Type,1
Subject,1
Organisation,1
Organisation Code,3612
Project Area,2259
Required Close Out Date,3
Closed Out Date,88
Days Late,3


Selected and extracted the two most relevant columns

In [45]:
# New data frame
df_new = df[['Root Cause', 'Detailed Description']]

In [46]:
# Going to remove the missing values in both colums
df_new.dropna(subset=['Detailed Description', 'Root Cause'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.dropna(subset=['Detailed Description', 'Root Cause'], inplace=True)


In [47]:
df_new

Unnamed: 0,Root Cause,Detailed Description
4,"Design Problem - Drawing, specification, or da...",Headroom is not sufficient at the ground floor...
5,Management Problem - Other,No information was provided for surveillance Q...
6,Design Problem - Inadequate or defective design,Further to Cxxx-NCR-002458 and Cxxx-NCR-002480...
64,Discipline/Workmanship - Defective construction,The expansion joint C-Crimp for link 28 on the...
65,Discipline/Workmanship - Defective construction,120mm 1C earth cables are installed between Ea...
...,...,...
25227,Discipline/Workmanship - Other,During the pour of the Area A of the level -5 ...
25228,Discipline/Workmanship - Defective construction,The Buttress wall has been constructed out of ...
25229,Discipline/Workmanship - Not Defined,The level 3.5 slab has been constructed out of...
25230,Discipline/Workmanship - Defective construction,__ have informed London Underground that some ...


In [48]:
# Analyze the 'Root Cause' column
root_cause_column = df_new['Root Cause']

# Get unique values and their counts
unique_values = root_cause_column.value_counts(dropna=False)

# Get the count of missing values
missing_count = root_cause_column.isna().sum()

# Display unique values and their counts, missing count
print("Unique values and their counts:")
print(unique_values.to_string())
print(f"\nNumber of missing values: {missing_count}")

Unique values and their counts:
Root Cause
Discipline/Workmanship - Defective construction                                                       4670
Discipline/Workmanship - Other                                                                        2893
Design Problem - Other                                                                                1484
Management Problem - Inadequate supervision                                                           1013
Material/Preservation Problems - Other                                                                 845
Design Problem - Inadequate or defective design                                                        774
Discipline/Workmanship - Prerequisites not satisfied                                                   621
Management Problem - Other                                                                             563
Management Problem - Inadequate administrative control                                               

The original dataset contained 95 unique root cause values, many of which had subcategories and some appeared infrequently. This creates class imbalance and makes it difficult to predict

To address this, I'm going to extract only the main category before the dash to simplify the labels.

# Data Transformation

In [49]:
# Extract main category before the first dash (if applicable)
df_new['Root_Cause_Main'] = df_new['Root Cause'].apply(lambda x: x.split(' - ')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Root_Cause_Main'] = df_new['Root Cause'].apply(lambda x: x.split(' - ')[0])


In [50]:
unique_values = df_new['Root_Cause_Main'].value_counts(dropna=False)
print(unique_values.to_string())

Root_Cause_Main
Discipline/Workmanship            9333
Design Problem                    3601
Management Problem                3336
Material/Preservation Problems    2095
Personnel Error                    579
Procedure Problem                  576
Subcontractor Problems             547
Supervision                        458
Verification Problems              359
External Phenomenon                306
Previous Construction              156
Training Deficiency                 23
Constructability                     2


Target value still seems imblanced.I will group categories with fewer than 1000 occurrences into 'Other' to improve model performance and ensure better balance in the dataset.

In [51]:
# Count occurrences of each main category
cat_counts = df_new['Root_Cause_Main'].value_counts()

# Identify low-frequency categories
low_freq_cat = cat_counts[cat_counts < 1000].index.tolist()

# Group low-frequency categories into 'Other'
df_new['Root_Cause_Main'] = df_new['Root_Cause_Main'].apply(lambda x: 'Other' if x in low_freq_cat else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Root_Cause_Main'] = df_new['Root_Cause_Main'].apply(lambda x: 'Other' if x in low_freq_cat else x)


In [52]:
unique_values = df_new['Root_Cause_Main'].value_counts(dropna=False)
print(unique_values.to_string())

Root_Cause_Main
Discipline/Workmanship            9333
Design Problem                    3601
Management Problem                3336
Other                             3006
Material/Preservation Problems    2095
