# Complete Preprocessing Of Data

In [1]:
import pandas as pd

In [None]:
files = ["/content/drive/MyDrive/sem6-project/datasets/Tshirt.csv",
         "/content/drive/MyDrive/sem6-project/datasets/cosmetic_Product (1).csv", "/content/drive/MyDrive/sem6-project/datasets/flipkartMob11.csv",
         "/content/drive/MyDrive/sem6-project/datasets/menClothing.csv", "/content/drive/MyDrive/sem6-project/datasets/smartphone.csv",
         "/content/drive/MyDrive/sem6-project/datasets/tvDataSet.csv", "/content/drive/MyDrive/sem6-project/datasets/flipcartHeadphones (1).csv",
         "/content/drive/MyDrive/sem6-project/datasets/flipcartTV.csv"]


In [None]:
columns_map = {
    'productName': 'productName',
    "name": 'productName',
    'phoneName': 'productName',
    'tvname': 'productName',
    'description': 'description',
    'productDescription': 'description',
    'question': 'question',
    'answer': 'answer'
}

In [None]:
product_list = []
for file in files:
  df = pd.read_csv(file)
  print(df.shape)
  df.rename(columns=columns_map, inplace=True)
  # print(df.columns)
  if 'description' not in df.columns:
    df['description'] = 'No description found!'
  temp = df[['productName','description', 'question', 'answer']]
  product_list.append(temp)

combined_dataset = pd.concat(product_list, ignore_index=True)

(849, 11)
(6925, 12)
(2139, 3)
(1147, 11)
(1345, 3)
(230, 3)
(5084, 12)
(1803, 14)


In [None]:
combined_dataset.columns


Index(['productName', 'description', 'question', 'answer'], dtype='object')

In [None]:
combined_dataset.shape

(19522, 4)

In [None]:
combined_dataset.head()

Unnamed: 0,productName,description,question,answer
0,Men Printed Polo Neck Poly Cotton (220 gsm) Re...,No description found!,,
1,Men Colorblock Polo Neck Cotton Blend Multicol...,No description found!,,
2,"Men Printed, Typography Round Neck Pure Cotton...",No description found!,My height 5.4 size suggestion...?,Xl chest 42 langth 29
3,"Men Printed, Typography Round Neck Pure Cotton...",No description found!,How's is product please tell me?,It's good 100% cotton febric
4,"Men Printed, Typography Round Neck Pure Cotton...",No description found!,What is gsm of this t shirt,180 gsm cotton febric


We have our all 4 required fileds, now handle missing values of each columns with a custom values

In [None]:
# write a code to handle/replace missing values of each column with a custome value
combined_dataset.isnull().sum()

Unnamed: 0,0
productName,0
description,2332
question,427
answer,428


In [None]:
# Define custom values for each column
custom_values = {
    'description': 'No description available',
    'answer': 'No answer available'
}

# Fill missing values using the custom values
combined_dataset = combined_dataset.fillna(custom_values)


In [None]:
combined_dataset.isnull().sum()

Unnamed: 0,0
productName,0
description,0
question,427
answer,0


In [None]:
combined_dataset.dropna(inplace=True)

In [None]:
combined_dataset.isnull().sum()

Unnamed: 0,0
productName,0
description,0
question,0
answer,0


In [None]:
combined_dataset.shape

(19095, 4)

Download and store final combined data for next step

In [None]:
combined_dataset.to_csv("/content/drive/MyDrive/sem6-project/datasets/CombinedDataset.csv", index=False)

This whole operation gave us a csv file with data having 4 columns i.e. productName, dscription, question and answer.

This is refined data with no missing values. Now lets proccede to tokenization and training.

## Review data

In [2]:
review_files = ["/content/drive/MyDrive/sem6-project/datasets/raw_data/castomatic_review.csv",
                "/content/drive/MyDrive/sem6-project/datasets/raw_data/headphone_review.csv",
                "/content/drive/MyDrive/sem6-project/datasets/raw_data/mobile_review.csv"]

review_cloumn_map = {
    "product_name": 'productName',
    "review_text": "review",
    "productname": "productName",
    "reviewtext": "review"
}

In [3]:
review_list = []

for file in review_files:
  df = pd.read_csv(file)
  print(df.shape)
  print(df.columns)
  df.rename(columns=review_cloumn_map, inplace=True)
  print(df.columns)
  temp = df[['productName', 'review']]
  review_list.append(temp)

review_dataset = pd.concat(review_list, ignore_index=True)
  # print(df.head())

(2617, 13)
Index(['web-scraper-order', 'web-scraper-start-url', 'newpage', 'newpage-href',
       'product', 'product-href', 'product_name', 'product_review',
       'product_review-href', 'next_review', 'next_review-href', 'review_text',
       'review_star'],
      dtype='object')
Index(['web-scraper-order', 'web-scraper-start-url', 'newpage', 'newpage-href',
       'product', 'product-href', 'productName', 'product_review',
       'product_review-href', 'next_review', 'next_review-href', 'review',
       'review_star'],
      dtype='object')
(15728, 13)
Index(['web-scraper-order', 'web-scraper-start-url', 'productlink',
       'productlink-href', 'newpage', 'newpage-href', 'productname',
       'allreviewlink', 'allreviewlink-href', 'newreview', 'newreview-href',
       'reviewtext', 'reviewstar'],
      dtype='object')
Index(['web-scraper-order', 'web-scraper-start-url', 'productlink',
       'productlink-href', 'newpage', 'newpage-href', 'productName',
       'allreviewlink', 'all

In [4]:
review_dataset.head(n=10)

Unnamed: 0,productName,review
0,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Oily.
1,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Nyce
2,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Very good
3,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Very good
4,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Osum cream bt not for whiting
5,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Baje cream don't waist Manny
6,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Amazing
7,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Nice
8,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,Good product
9,SBL Silk' n Stay Aloevera Cream 100g each [Pac...,It's a nice


In [5]:
review_dataset.shape

(27939, 2)

In [6]:
review_dataset.columns

Index(['productName', 'review'], dtype='object')

In [7]:
review_dataset.isnull().sum()

Unnamed: 0,0
productName,0
review,16


In [8]:
review_dataset.dropna(inplace=True)

In [9]:
review_dataset.isnull().sum()

Unnamed: 0,0
productName,0
review,0


In [10]:
review_dataset.to_csv("/content/drive/MyDrive/sem6-project/datasets/CombinedReviewDataset.csv", index=False)