In [2]:
import pandas as pd

# Read both raw CSV exports (paths are relative to the notebook's working directory)
mine_df, sales_df = (
    pd.read_csv(csv_name) for csv_name in (r'Mine.csv', r'Sales.csv')
)


In [3]:

# 1. Remove unnecessary columns
# 'Total Revenue' is treated as unnecessary in the Sales dataset.
# errors='ignore' makes the drop a no-op when the column is absent,
# so no explicit membership check is needed.
sales_df.drop(columns=['Total Revenue'], errors='ignore', inplace=True)



In [4]:

# 2. Handle missing values
# Fill numerical columns with mean and categorical columns with 'Unknown'
def fill_missing_values(df):
    """Return a copy of ``df`` with missing cells imputed.

    Numeric columns (any integer/float width, not only float64/int64) are
    filled with the column mean; every other column is filled with the
    string 'Unknown'. Columns without missing values are left untouched, and
    datetime columns are skipped so that NaT stays the missing-date marker
    and the cell can be re-run after the date conversion step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the fill values applied.
    """
    fill_values = {}
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            # Nothing to fill; also avoids computing a mean that is never used.
            continue
        if pd.api.types.is_datetime64_any_dtype(column):
            # Writing 'Unknown' into a datetime column would break its dtype.
            continue
        if pd.api.types.is_numeric_dtype(column):
            fill_values[col] = column.mean()
        else:
            fill_values[col] = 'Unknown'
    return df.fillna(fill_values) if fill_values else df.copy()


mine_df = fill_missing_values(mine_df)
sales_df = fill_missing_values(sales_df)



In [13]:


# 3. Convert date columns to consistent formats
def parse_date_columns(df):
    """Convert every column whose name contains 'date' to datetime, in place.

    The column is first parsed in one vectorised call. Recent pandas versions
    infer a single format from the first value, so with errors='coerce' a
    date written in a different format is silently turned into NaT. Values
    that failed the first pass are therefore retried one by one, which lets
    each value be parsed on its own. Values that still cannot be parsed
    (for example the 'Unknown' placeholder) end up as NaT.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose date-named columns are converted. Modified in place.
    """
    for col in df.columns:
        if 'date' not in col.lower():
            continue
        raw = df[col]
        parsed = pd.to_datetime(raw, errors='coerce')
        # Non-missing inputs that came back as NaT did not match the first pass.
        unparsed = parsed.isna() & raw.notna()
        if unparsed.any():
            parsed.loc[unparsed] = raw[unparsed].map(
                lambda value: pd.to_datetime(value, errors='coerce')
            )
        df[col] = parsed


parse_date_columns(mine_df)
parse_date_columns(sales_df)

print(sales_df)



   Order ID  Customer Name Order Date   Product   Quantity  Unit Price
0      1001       John Doe 2024-01-01  Widget A  10.000000   25.000000
1      1002     Jane Smith 2024-01-02  Widget B   5.000000   40.000000
2      1003        Unknown        NaT  Widget A   5.142857   25.000000
3      1004  Alice Johnson 2024-04-01  Widget C   3.000000   35.714286
4      1005      Bob Brown        NaT  Widget B  10.000000   40.000000
5      1006       John Doe 2024-06-01  Widget A   4.000000   25.000000
7      1007     Jane Smith 2024-07-01  Widget C   6.190476   70.000000


In [6]:

# 4. Drop duplicate rows
# Exact duplicate rows are removed from each dataset in place
# (the first occurrence is the one that is kept).
for frame in (mine_df, sales_df):
    frame.drop_duplicates(inplace=True)


In [7]:

# 5. Handle wrong data
# Replace negative values in numerical columns with column mean
def replace_negatives_with_mean(df):
    """Replace negative values in numeric columns with the column mean, in place.

    The replacement value is the mean of the column's non-negative values, so
    the invalid negatives do not bias it. All signed integer and float
    columns are processed, whatever their width. Columns without any
    negative value are left untouched, which keeps their dtype. A column
    that had negatives comes back as float because the mean is a float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Modified in place; nothing is returned.
    """
    for col in df.columns:
        values = df[col]
        # 'i' = signed integer, 'f' = float. Other kinds (unsigned, bool,
        # datetime, object, ...) cannot hold a negative number to fix.
        if values.dtype.kind not in 'if':
            continue
        negative = values < 0
        if not negative.any():
            continue
        mean_value = values[values >= 0].mean()
        df[col] = values.mask(negative, mean_value)

# Clean both datasets; the helper modifies each frame in place.
for frame in (mine_df, sales_df):
    replace_negatives_with_mean(frame)


In [8]:

# Save cleaned datasets to new CSV files
# index=False keeps the DataFrame row labels out of the written files.
for frame, target in ((mine_df, 'Mine_cleaned.csv'), (sales_df, 'Sales_cleaned.csv')):
    frame.to_csv(target, index=False)


In [9]:

# Output the cleaned datasets
# Read the files back from disk so the preview shows what was actually saved.
mine_df_cleaned, sales_df_cleaned = map(
    pd.read_csv, ('Mine_cleaned.csv', 'Sales_cleaned.csv')
)

# The cell's value is a tuple holding the first five rows of each frame.
tuple(cleaned.head() for cleaned in (mine_df_cleaned, sales_df_cleaned))


(   Duration        Date  Pulse  Maxpulse  Calories
 0        60  2023-10-01  110.0     130.0     409.1
 1        60  2023-10-02  117.0     145.0     479.0
 2        60  2023-10-03  103.0     135.0     340.3
 3        45  2023-10-04  109.0     175.0     282.4
 4        45  2023-10-05  117.0     150.0     405.1,
    Order ID  Customer Name  Order Date   Product   Quantity  Unit Price
 0      1001       John Doe  2024-01-01  Widget A  10.000000   25.000000
 1      1002     Jane Smith  2024-01-02  Widget B   5.000000   40.000000
 2      1003        Unknown         NaN  Widget A   5.142857   25.000000
 3      1004  Alice Johnson  2024-04-01  Widget C   3.000000   35.714286
 4      1005      Bob Brown         NaN  Widget B  10.000000   40.000000)

In [10]:
# 1. Missing values / Empty cells
# The script uses fillna to fill missing values in numerical 
# columns with the column mean and categorical columns with 'Unknown'.

# 2. Inconsistent date formats
# The script uses pd.to_datetime (with errors='coerce') to convert every column whose
# name contains 'date' to a consistent datetime format; values that cannot be parsed become NaT.

# 3. Duplicate rows
# The script uses drop_duplicates to remove duplicate rows from both datasets.

# 4. Wrong data
# The script defines a function replace_negatives_with_mean to replace 
# negative values in numerical columns with the mean of the non-negative values in that column.

# 5. Unnecessary columns that are not relevant to the analysis
# The script checks for the presence of a column named 'Total Revenue' in 
# the Sales dataset and drops it if it exists.