In [1]:
from pathlib import Path

import pandas as pd

# Directory holding the cleaned CSV exports. Keeping it in one place means the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = "C:/Users/ATAISH NEHRA/Downloads/Cleaned Datasets"

# CSV files that are loaded here so they can be checked for outliers.
DATASET_FILES = [
    "co_concentrations.csv",
    "surface_temp_clean.csv",
    "sea_level_changes.csv",
    "climate_disasters_clean.csv",
    "Forest and Carbon_cleaned.csv",
    "Land Cover Accounts_cleaned.csv",
]

# Full path of every dataset, kept as plain strings.
file_paths = [f"{DATA_DIR}/{name}" for name in DATASET_FILES]

# Maps each dataset name (file name without its extension) to its DataFrame.
dataframes = {}

# Loading the data into pandas DataFrames
for file_path in file_paths:
    # Path.stem strips only the final suffix, so a file name that contains
    # additional dots is not truncated at the first one.
    file_name = Path(file_path).stem
    dataframes[file_name] = pd.read_csv(file_path)

# Displaying the first few rows of each DataFrame to understand their structure
for file_name, df in dataframes.items():
    print(f"First few rows of {file_name}:\n", df.head(), "\n")



First few rows of co_concentrations:
        CO PPM  CO Percent
0  356.545833    0.237500
1  357.215000    0.187500
2  358.959167    0.488333
3  360.968333    0.560833
4  362.743333    0.492500 

First few rows of surface_temp_clean:
    Year  Afghanistan, Islamic Rep. of  Albania  Algeria  American Samoa  \
0  1961                        -0.113    0.627    0.164           0.079   
1  1962                        -0.164    0.326    0.114          -0.042   
2  1963                         0.847    0.075    0.077           0.169   
3  1964                        -0.764   -0.166    0.250          -0.140   
4  1965                        -0.244   -0.388   -0.100          -0.562   

   Andorra, Principality of  Angola  Anguilla  Antigua and Barbuda  Argentina  \
0                     0.736   0.041     0.086                0.090      0.122   
1                     0.112  -0.152    -0.024                0.031     -0.046   
2                    -0.752  -0.190     0.234                0.288     

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to detect outliers using IQR method
def detect_outliers(df, columns, multiplier=1.5):
    """Return the index labels of rows that are IQR outliers in any column.

    For every column in ``columns`` a value is an outlier when it lies below
    ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR``, where Q1 and
    Q3 are the column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    columns : iterable
        Labels of the columns of ``df`` to check.
    multiplier : float, default 1.5
        Width of the fences expressed in multiples of the IQR.

    Returns
    -------
    list
        Unique index labels of the rows flagged in at least one column, in the
        order the rows appear in ``df``. Empty when ``columns`` is empty or no
        value falls outside the fences.
    """
    combined_mask = None

    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = multiplier * (q3 - q1)
        # Compare the column directly instead of filtering the whole frame:
        # only a boolean row mask is needed. Missing comparison results are
        # treated as "not an outlier".
        col_mask = ((values < q1 - fence) | (values > q3 + fence)).to_numpy(
            dtype=bool, na_value=False
        )
        combined_mask = col_mask if combined_mask is None else combined_mask | col_mask

    if combined_mask is None:
        # No columns were supplied, so nothing can be flagged.
        return []

    # unique() keeps first-appearance order, so every label is reported once
    # and the result follows the row order of df.
    return df.index[combined_mask].unique().tolist()

# Analyzing each dataset for outliers
# outlier_summary maps each dataset name to its row count, the number of rows
# flagged by detect_outliers, and that number as a percentage of all rows.
outlier_summary = {}

for file_name, df in dataframes.items():
    # Only float64/int64 columns are checked.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    outliers = detect_outliers(df, numeric_cols)
    total_rows = df.shape[0]
    outlier_summary[file_name] = {
        'total_rows': total_rows,
        'outlier_count': len(outliers),
        # A dataset without rows has no outliers; report 0.0 rather than
        # raising ZeroDivisionError.
        'outlier_percentage': len(outliers) / total_rows * 100 if total_rows else 0.0
    }

# Bare expression so the notebook renders the summary as the cell output.
outlier_summary

{'co_concentrations': {'total_rows': 32,
  'outlier_count': 4,
  'outlier_percentage': 12.5},
 'surface_temp_clean': {'total_rows': 62,
  'outlier_count': 17,
  'outlier_percentage': 27.419354838709676},
 'sea_level_changes': {'total_rows': 31,
  'outlier_count': 1,
  'outlier_percentage': 3.225806451612903},
 'climate_disasters_clean': {'total_rows': 43,
  'outlier_count': 3,
  'outlier_percentage': 6.976744186046512},
 'Forest and Carbon_cleaned': {'total_rows': 29,
  'outlier_count': 0,
  'outlier_percentage': 0.0},
 'Land Cover Accounts_cleaned': {'total_rows': 29,
  'outlier_count': 0,
  'outlier_percentage': 0.0}}