## Importing Libraries

In [None]:
import os
import pandas as pd
import glob

## Extract All Files and Combine Them into One File

In [6]:
# Read every CSV found in the immediate sub-folders of the dataset root,
# stack them into one DataFrame and write it to 'combined_output.csv'
# (relative to the current working directory).
all_data = []

# Root directory whose sub-folders contain the CSV files.
# NOTE(review): absolute, machine-specific path -- point it at your local copy
# of the dataset before running this cell.
root_directory = "C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data"

# os.listdir() and glob.glob() return entries in arbitrary order, so both
# listings are sorted to keep the row order of the combined file reproducible.
for folder in sorted(os.listdir(root_directory)):
    folder_path = os.path.join(root_directory, folder)
    if not os.path.isdir(folder_path):
        continue  # Ignore plain files sitting next to the sub-folders

    # Only CSV files directly inside this sub-folder are picked up (no recursion)
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

    for file in csv_files:
        print(f"Reading file: {file}")  # Optional: Display progress
        try:
            df = pd.read_csv(file)
            all_data.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error reading file {file}: {e}")

# Concatenate once at the end; ignore_index=True gives the result a fresh 0..n-1 index
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    print("All files have been combined successfully!")

    # Persist the combined frame without the index column
    combined_df.to_csv("combined_output.csv", index=False)
    print("Combined data has been saved to 'combined_output.csv'")
else:
    # Reached when no file was read successfully (none found, or every read failed)
    print("No CSV files found to combine.")


Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170201020207.csv
Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170201070114.csv
Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170201120124.csv
Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170201170311.csv
Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170201220228.csv
Reading file: C:/Users/Compu Dream/Desktop/Data for A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells/data\0\WELL-00001_20170202030343.cs

## Read Data

In [4]:
 data = pd.read_csv('combined_output.csv')

In [5]:
# Preview the first rows to confirm the combined file loaded with the expected columns.
data.head()

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-02-01 02:02:07.000000,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
1,2017-02-01 02:02:08.000000,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2,2017-02-01 02:02:09.000000,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0
3,2017-02-01 02:02:10.000000,0.0,10091780.0,119.0944,1635018.0,84.57429,1564148.0,,0.0,0.0
4,2017-02-01 02:02:11.000000,0.0,10091670.0,119.0944,1643424.0,84.56644,1564148.0,,0.0,0.0


In [6]:
# (rows, columns) of the combined frame before any cleaning.
data.shape

(50913215, 10)

In [7]:
# Per-column missing-data summary: absolute null count and its share of all
# rows in percent, ordered from most to fewest nulls.
# The counts are computed once and reused, because each isna() pass scans the
# entire frame.
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
# Styled view: cells are shaded by magnitude using the "Wistia" colormap
null.style.background_gradient(cmap="Wistia")

Unnamed: 0,Null Values,Percentage Null Values
T-JUS-CKGL,50913215,100.0
P-JUS-CKGL,40814636,80.165112
QGL,40130864,78.822098
T-TPT,5810976,11.413493
T-JUS-CKP,1702789,3.344493
P-MON-CKP,1121628,2.203019
P-TPT,6184,0.012146
P-PDG,5905,0.011598
class,5130,0.010076
timestamp,0,0.0


`Three columns (T-JUS-CKGL, P-JUS-CKGL and QGL) are mostly null — 100%, about 80% and about 79% of their values respectively — so we drop them.`

In [8]:
# Remove the columns that the null summary above shows to be mostly empty
# (T-JUS-CKGL is entirely null; P-JUS-CKGL and QGL are roughly 80% null).
sparse_columns = ['T-JUS-CKGL', 'P-JUS-CKGL', 'QGL']
data = data.drop(columns=sparse_columns)

In [9]:
# Recompute the per-column missing-data summary for the remaining columns:
# absolute null count and its share of all rows in percent, most nulls first.
# The counts are computed once and reused, because each isna() pass scans the
# entire frame.
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
# Styled view: cells are shaded by magnitude using the "Wistia" colormap
null.style.background_gradient(cmap="Wistia")

Unnamed: 0,Null Values,Percentage Null Values
T-TPT,5810976,11.413493
T-JUS-CKP,1702789,3.344493
P-MON-CKP,1121628,2.203019
P-TPT,6184,0.012146
P-PDG,5905,0.011598
class,5130,0.010076
timestamp,0,0.0


In [10]:
# (rows, columns) after the column drop; only the column count is expected to differ.
data.shape

(50913215, 7)

## Save Data as a CSV file

In [11]:
# Write the cleaned frame to disk without the index column. The file name is
# defined once so the confirmation message always matches the path written.
cleaned_csv_name = "combined_output_cleaned_part_1.csv"
data.to_csv(cleaned_csv_name, index=False)
print(f"combined_output_cleaned_part_1 has been saved to '{cleaned_csv_name}'")

combined_output_cleaned_part_1 has been saved to 'combined_output_cleaned_part_1.csv'
