In [1]:
#Missing values dataset
import pandas as pd
import numpy as np

np.random.seed(42) # Set a seed for reproducibility

# Create a DataFrame with 50 rows and 5 columns
data = {
    'A': np.random.randint(1, 100, size=50),
    'B': np.random.randint(1, 100, size=50),
    'C': np.random.choice([1, 2, 3, 4, 5], size=50),
    'D': np.random.choice([10.5, 20.5, 30.5, 40.5, 50.5], size=50),
    'E': np.random.choice(['X', 'Y', 'Z'], size=50)
}

df = pd.DataFrame(data)

# Introduce random missing values (NaNs) in columns A, B, and C
missing_indices = np.random.choice(df.index, size=15, replace=False)  
# Randomly select 15 rows to be missing
df.loc[missing_indices, 'A'] = np.nan
df.loc[missing_indices, 'B'] = np.nan

# Introduce NaNs in a few other random spots
df.loc[np.random.choice(df.index, size=5, replace=False), 'C'] = np.nan
df.loc[np.random.choice(df.index, size=5, replace=False), 'D'] = np.nan

# Save the DataFrame to a CSV file
df.to_csv('missing_values_dataset.csv', index=False)

# Print the first few rows to see what it looks like
print(df.head())

      A     B    C     D  E
0  52.0   4.0  4.0  40.5  X
1  93.0  89.0  NaN  50.5  X
2  15.0  60.0  4.0  50.5  X
3  72.0  14.0  5.0  30.5  X
4  61.0   9.0  1.0   NaN  Z


In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('missing_values_dataset.csv') 
# Display the original dataset with missing values
print("Original dataset with missing values:")
print(df)

# Impute missing values for column A with mean
df['A'].fillna(df['A'].mean(), inplace=True)

# Impute missing values for column B with median
df['B'].fillna(df['B'].median(), inplace=True)

# Impute missing values for column C with mode
df['C'].fillna(df['C'].mode()[0], inplace=True)

# Display the dataset after imputation
print("\nDataset after imputation:")
print(df)
#print(df.head())

# Save the updated DataFrame to a new CSV file
df.to_csv('imputed_missing_values_dataset.csv', index=False)

print("\nUpdated dataset saved as 'imputed_missing_values_dataset.csv'")

Original dataset with missing values:
       A     B    C     D  E
0   52.0   4.0  4.0  40.5  X
1   93.0  89.0  NaN  50.5  X
2   15.0  60.0  4.0  50.5  X
3   72.0  14.0  5.0  30.5  X
4   61.0   9.0  1.0   NaN  Z
5   21.0  90.0  5.0  40.5  X
6   83.0  53.0  5.0  50.5  Y
7   87.0   2.0  1.0  30.5  Y
8   75.0  84.0  1.0  30.5  Y
9    NaN   NaN  NaN  40.5  Z
10  88.0  60.0  1.0  20.5  X
11  24.0  71.0  4.0  20.5  X
12   NaN   NaN  NaN  50.5  X
13   NaN   NaN  3.0  10.5  Z
14   NaN   NaN  1.0  50.5  Y
15   2.0  35.0  3.0  40.5  Y
16  88.0  78.0  3.0  40.5  X
17   NaN   NaN  NaN  40.5  Y
18  38.0  36.0  3.0   NaN  Y
19   2.0  50.0  5.0  40.5  Z
20   NaN   NaN  2.0   NaN  Z
21  60.0   2.0  2.0  20.5  Z
22  21.0   6.0  1.0  40.5  Z
23  33.0  54.0  4.0  10.5  X
24  76.0   4.0  1.0  10.5  Z
25  58.0  54.0  4.0  10.5  Y
26   NaN   NaN  2.0  10.5  X
27  89.0  63.0  1.0   NaN  Y
28  49.0  18.0  NaN  10.5  Y
29  91.0  90.0  3.0  40.5  Y
30  59.0  44.0  4.0  50.5  Z
31  42.0  34.0  3.0  10.5  Z
32  9

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import os

# Load the dataset
file_path = 'data-for-cleaning.csv' 
data = pd.read_csv(file_path)

# Step 1: Handle missing values
# Impute numerical columns with mean
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='mean')
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

# Impute categorical columns with the most frequent value
categorical_cols = data.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

# Step 2: Remove outliers using the IQR method
def remove_outliers(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Apply outlier removal to numerical columns
data = remove_outliers(data, numerical_cols)

# Step 3: Standardize numerical columns
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Save the cleaned dataset
output_file_path = 'cleaned_data.csv'
data.to_csv(output_file_path, index=False)

print(f"Data cleaned and saved to {output_file_path}")


Data cleaned and saved to cleaned_data.csv


In [3]:
import pandas as pd
# --- Optional: Self-Made Dataset (Uncomment to use) ---
data = {
     'user_id': [1, 2, 3, 4, 4, 5, 6, 7],
     'game_title': ['A', 'B', 'C', 'D', 'D', 'E', 'F', 'G'],
     'behavior_name': ['play', 'purchase', 'play', 'purchase', 'purchase', 'play', 'play', 'play'],
     'value': [5, 1, 100, 1, 1, 120, 4, 3],
     'x': [None] * 8  # Optional unnecessary column
 }
df = pd.DataFrame(data)
df.to_csv('steam_sample.csv', index=False)

# --- Step 1: Load Dataset ---
try:
    df = pd.read_csv("steam_sample.csv")
except FileNotFoundError:
    print("Dataset not found. Please make sure 'steam_sample.csv' is present.")
    exit()

print("Original Dataset Preview:")
print(df.head())

# --- Step 2: Remove Duplicates ---
df_no_duplicates = df.drop_duplicates()
print("\nAfter Removing Duplicates:")
print(df_no_duplicates.head())

# --- Step 3: Handle Missing Values ---
for col in df_no_duplicates.columns:
    if pd.api.types.is_numeric_dtype(df_no_duplicates[col]):
        df_no_duplicates[col].fillna(df_no_duplicates[col].mean(), inplace=True)
    else:
        df_no_duplicates[col].fillna(df_no_duplicates[col].mode()[0], inplace=True)

print("\nAfter Handling Missing Values:")
print(df_no_duplicates.head())

# --- Step 4: Handle Outliers in 'value' column ---
Q1 = df_no_duplicates['value'].quantile(0.25)
Q3 = df_no_duplicates['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df_no_duplicates[(df_no_duplicates['value'] >= lower_bound) & (df_no_duplicates['value'] <= upper_bound)]

print("\nAfter Removing Outliers:")
print(df_no_outliers.head())

# --- Step 5: Drop Unnecessary Columns (e.g., 'x') ---
if 'x' in df_no_outliers.columns:
    df_no_outliers = df_no_outliers.drop(columns=['x'])

print("\nFinal Cleaned Dataset:")
print(df_no_outliers.head())

Original Dataset Preview:
   user_id game_title behavior_name  value   x
0        1          A          play      5 NaN
1        2          B      purchase      1 NaN
2        3          C          play    100 NaN
3        4          D      purchase      1 NaN
4        4          D      purchase      1 NaN

After Removing Duplicates:
   user_id game_title behavior_name  value   x
0        1          A          play      5 NaN
1        2          B      purchase      1 NaN
2        3          C          play    100 NaN
3        4          D      purchase      1 NaN
5        5          E          play    120 NaN

After Handling Missing Values:
   user_id game_title behavior_name  value   x
0        1          A          play      5 NaN
1        2          B      purchase      1 NaN
2        3          C          play    100 NaN
3        4          D      purchase      1 NaN
5        5          E          play    120 NaN

After Removing Outliers:
   user_id game_title behavior_name  value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates[col].fillna(df_no_duplicates[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates[col].fillna(df_no_duplicates[col].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates[col].fillna(df_no_duplicates[col].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u