In [1]:
import pandas as pd

def count_null_values(file_path):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Count null values in each column
    null_counts = df.isnull().sum()

    # Display the null counts
    print("Null values count per column:")
    print(null_counts)

    total_rows = len(df)

    # Display the total row count
    print("Total number of rows:")
    print(total_rows)

# Example usage
input_file = 'amazon.csv'  # Replace with your actual input file path
count_null_values(input_file)

Null values count per column:
Unnamed: 0             0
name                   0
main_category          0
sub_category           0
image                  0
link                   0
ratings           175794
no_of_ratings     175794
discount_price     61163
actual_price       17813
dtype: int64
Total number of rows:
551585


In [2]:
import pandas as pd

def clean_data(file_path, output_file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Drop rows with any missing values
    df.dropna(inplace=True)

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)

    # Optional: Reset index after dropping rows
    df.reset_index(drop=True, inplace=True)

    # Save the cleaned data to a new CSV file
    df.to_csv(output_file_path, index=False)
    print(f"Data cleaned and saved to {output_file_path}")

# Example usage
input_file = 'amazon.csv'  # Replace with your actual input file path
output_file = 'cleaned_amazon.csv'  # Path to save the cleaned data
clean_data(input_file, output_file)


Data cleaned and saved to cleaned_amazon.csv


In [3]:
import pandas as pd

def count_null_values(file_path):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Count null values in each column
    null_counts = df.isnull().sum()

    # Display the null counts
    print("Null values count per column:")
    print(null_counts)

    total_rows = len(df)

    # Display the total row count
    print("Total number of rows:")
    print(total_rows)

# Example usage
input_file = 'cleaned_amazon.csv'  # Replace with your actual input file path
count_null_values(input_file)

Null values count per column:
Unnamed: 0        0
name              0
main_category     0
sub_category      0
image             0
link              0
ratings           0
no_of_ratings     0
discount_price    0
actual_price      0
dtype: int64
Total number of rows:
340680


In [4]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv('cleaned_amazon.csv')

In [6]:
print(df.columns)

Index(['Unnamed: 0', 'name', 'main_category', 'sub_category', 'image', 'link',
       'ratings', 'no_of_ratings', 'discount_price', 'actual_price'],
      dtype='object')


In [7]:
df['Age'] = np.random.randint(18, 70, size=df.shape[0])
genders = ['Male', 'Female']
df['Gender'] = np.random.choice(genders, size=df.shape[0])
occasions = ['Anniversery', 'Birthday', 'Baby & Expecting', 'Diwali', 'Mothers Day',
 'Friendshipday', 'Christmas', 'New Year', 'Chrismas', 'Navaratri',
 'Raksha Bandan', "Father's Day", 'Friendship', 'Friendship day', 'New year',
 'Raksha Bandhan', 'Fathers Day', 'Friendship Day']
df['Occasion'] = np.random.choice(occasions, size=df.shape[0])

In [8]:
print(df[['Age', 'Gender', 'Occasion']])

        Age  Gender        Occasion
0        20  Female   Friendshipday
1        54  Female     Fathers Day
2        27  Female       Navaratri
3        56  Female  Raksha Bandhan
4        22  Female     Anniversery
...     ...     ...             ...
340675   25  Female  Friendship day
340676   27  Female       Christmas
340677   50    Male       Christmas
340678   60    Male          Diwali
340679   29    Male        New Year

[340680 rows x 3 columns]


In [9]:
def assign_relationship(row):
    if row['Gender'] == 'Female':
        return np.random.choice(['Wife', 'Sister', 'Daughter', 'Mother', 'For Her', 'wife'])
    elif row['Gender'] == 'Male':
        return np.random.choice(['Brother', 'Father', 'Son', 'For Him', 'Husband'])
    else:
        # Randomly assign male or female attributes to gender-neutral roles
        neutral_roles = ['Kids', 'Friend', 'Boss', 'Colleague']
        # Extend the choice to include gender indication
        role = np.random.choice(neutral_roles)
        role_gender = np.random.choice(['Male', 'Female'])
        return f"{role} ({role_gender})"

# Apply function to each row
df['Relationship'] = df.apply(assign_relationship, axis=1)

In [10]:
def assign_occasion(row):
    if row['Relationship'] in ['Wife', 'Husband']:
        return 'Anniversary'
    elif row['Relationship'] == 'Mother':
        return "Mother's Day"
    elif row['Relationship'] == 'Father':
        return "Father's Day"
    elif row['Relationship'] in ['Brother', 'Sister']:
        return 'Raksha Bandhan'
    else:
        # Assign random occasions to other relationships
        return np.random.choice(['Birthday', 'Graduation', 'Wedding', 'Retirement'])

df['Occasion'] = df.apply(assign_occasion, axis=1)

In [11]:
print(df[['Relationship', 'Gender', 'Occasion']])

       Relationship  Gender        Occasion
0              wife  Female      Graduation
1           For Her  Female      Graduation
2            Sister  Female  Raksha Bandhan
3              wife  Female        Birthday
4              Wife  Female     Anniversary
...             ...     ...             ...
340675       Sister  Female  Raksha Bandhan
340676     Daughter  Female         Wedding
340677      Husband    Male     Anniversary
340678      Brother    Male  Raksha Bandhan
340679      Brother    Male  Raksha Bandhan

[340680 rows x 3 columns]


In [12]:
df.to_csv('updated_dataset.csv', index=False)

In [13]:
print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [14]:
import pandas as pd


df = pd.read_csv('updated_dataset.csv')

df.replace({'₹': ''}, regex=True, inplace=True)

df.to_csv('cleaned_amazon_data.csv', index=False)

In [15]:
df = pd.read_csv('cleaned_amazon_data.csv')

print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [16]:
import pandas as pd

conversion_rate = 83

df = pd.read_csv('cleaned_amazon_data.csv')

df['discount_price'] = df['discount_price'].replace({'₹': '', ',': ''}, regex=True).astype(float)
df['actual_price'] = df['actual_price'].replace({'₹': '', ',': ''}, regex=True).astype(float)

df['discount_price'] = df['discount_price'] / conversion_rate
df['actual_price'] = df['actual_price'] / conversion_rate

df.to_csv('converted_amazon_data.csv', index=False)


In [17]:
df = pd.read_csv('converted_amazon_data.csv')

print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [18]:
import pandas as pd

df = pd.read_csv('converted_amazon_data.csv')

df['discount_price'] = df['discount_price'].round()
df['actual_price'] = df['actual_price'].round()

df.to_csv('rounded_amazon_data.csv', index=False)

print(df.head())


   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [19]:
import pandas as pd

# Load your dataset
df = pd.read_csv('rounded_amazon_data.csv')  # replace 'path_to_your_file.csv' with the path to your dataset

# Get unique values from the 'relationship' column
unique_relationships = df['main_category'].unique()

# Print unique values
print(unique_relationships)

['appliances' 'car & motorbike' 'tv, audio & cameras' 'sports & fitness'
 'grocery & gourmet foods' 'home & kitchen' 'pet supplies' 'stores'
 'toys & baby products' "kids' fashion" 'bags & luggage' 'accessories'
 "women's shoes" 'beauty & health' "men's shoes" "women's clothing"
 'industrial supplies' "men's clothing" 'music' 'home, kitchen, pets']


In [20]:
import pandas as pd

df = pd.read_csv('rounded_amazon_data.csv')

category_gender_map = {
    "women's shoes": 'Female',
    "women's clothing": 'Female',
    "men's shoes": 'Male',
    "men's clothing": 'Male'
}


df['Gender'] = df['main_category'].map(category_gender_map).fillna(df['Gender'])


df.loc[(df['main_category'] == 'stores') & (df['sub_category'] == "Men's Fashion"), 'Gender'] = 'Male'


print(df[['main_category', 'sub_category', 'Gender']].head())


df.to_csv('updated_amazon_data.csv', index=False)


  main_category      sub_category  Gender
0    appliances  Air Conditioners  Female
1    appliances  Air Conditioners  Female
2    appliances  Air Conditioners  Female
3    appliances  Air Conditioners  Female
4    appliances  Air Conditioners  Female


In [21]:
print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [22]:
import numpy as np
import pandas as pd

def assign_relationship(row):
    if row['Gender'] == 'Female':
        return np.random.choice(['Wife', 'Sister', 'Daughter', 'Mother', 'For Her', 'Wife'])
    elif row['Gender'] == 'Male':
        return np.random.choice(['Brother', 'Father', 'Son', 'For Him', 'Husband'])
    else:
        neutral_roles = ['Kids', 'Friend', 'Boss', 'Colleague']
        role = np.random.choice(neutral_roles)
        role_gender = np.random.choice(['Male', 'Female'])
        return f"{role} ({role_gender})"

df['Relationship'] = df.apply(assign_relationship, axis=1)

df.to_csv('relationship_updated_amazon_data.csv', index=False)

print(df[['main_category', 'sub_category', 'Gender', 'Relationship']].head())


  main_category      sub_category  Gender Relationship
0    appliances  Air Conditioners  Female         Wife
1    appliances  Air Conditioners  Female         Wife
2    appliances  Air Conditioners  Female         Wife
3    appliances  Air Conditioners  Female       Sister
4    appliances  Air Conditioners  Female     Daughter


In [23]:
df = pd.read_csv('relationship_updated_amazon_data.csv')

print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               

In [24]:
import numpy as np
import pandas as pd

def assign_occasion(row):
    if row['Relationship'] in ['Wife', 'Husband']:
        return 'Anniversary'
    elif row['Relationship'] == 'Mother':
        return "Mother's Day"
    elif row['Relationship'] == 'Father':
        return "Father's Day"
    elif row['Relationship'] in ['Brother', 'Sister']:
        return 'Raksha Bandhan'
    else:
        return np.random.choice(['Birthday', 'Graduation', 'Wedding', 'Retirement'])

# Apply the function to each row
df['Occasion'] = df.apply(assign_occasion, axis=1)

# Save the updated DataFrame to a new CSV file
df.to_csv('occasion_updated_amazon_data.csv', index=False)

# Display the updated DataFrame with the new 'occasion' column
print(df[['Relationship', 'Occasion']].head())


  Relationship        Occasion
0         Wife     Anniversary
1         Wife     Anniversary
2         Wife     Anniversary
3       Sister  Raksha Bandhan
4     Daughter      Graduation


In [25]:
df = pd.read_csv('occasion_updated_amazon_data.csv')

print(df.head())

   Unnamed: 0                                               name  \
0           0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...   
1           1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...   
2           2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...   
3           3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...   
4           4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...   

  main_category      sub_category  \
0    appliances  Air Conditioners   
1    appliances  Air Conditioners   
2    appliances  Air Conditioners   
3    appliances  Air Conditioners   
4    appliances  Air Conditioners   

                                               image  \
0  https://m.media-amazon.com/images/I/31UISB90sY...   
1  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                               