In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw rubber and blade tables from the notebook-relative data/ folder.
rubber, blade = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Rubbers.csv', 'data/Blades.csv')
)

### Rubber Dataset

In [4]:
# Preview the first five raw rubber rows before any cleaning.
rubber.head(n=5)

Unnamed: 0,Product Name,Price,Spin,Speed,Control
0,729 729-08ES,$22.00,120.0,110.0,120.0
1,729 Top Point,$18.00,110.0,120.0,110.0
2,Andro Backside 20D,$29.99,103.0,85.0,97.0
3,Andro Chaos,$29.99,73.0,75.0,87.0
4,Andro GTT 40,$24.99,101.0,96.0,92.0


In [9]:
# Null Values:
def find_null(data):
    """Count non-null and null entries in every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and exactly two rows, indexed
        ``False`` (number of non-null values) and ``True`` (number of null
        values). All counts are integers.
    """
    null_mask = data.isnull()
    # Summing the boolean mask counts per column directly, so both rows are
    # always present and stay integer-typed even when a column (or the whole
    # frame) has no missing values.
    null_values = pd.DataFrame({
        False: (~null_mask).sum(),
        True: null_mask.sum(),
    }).T
    return null_values
# Legend for the row labels of the table below, then the null summary
# for the rubber data.
print('False = Not Null count', 'True = Null count', sep='\n')
find_null(rubber)

False = Not Null count
True = Null count


Unnamed: 0,Product Name,Price,Spin,Speed,Control
False,348.0,348.0,329,339,233
True,0.0,0.0,19,9,115


In [10]:
#Rename Product Name Column (a no-op on re-run, once the column is renamed)
rubber = rubber.rename(columns = {'Product Name': 'Product_Name'})

#Remove dollar sign in Price column
# Going through str first keeps this cell re-runnable: on a second run Price
# is already float, and a per-element x.strip('$') would raise AttributeError.
rubber['Price'] = rubber['Price'].astype(str).str.strip('$').astype(float)

#Dealing with NA values: replace missing ratings with the column mean
# NOTE(review): the null table above shows Control missing for 115 of 348
# rows, so mean imputation pins about a third of that column to one value
# -- confirm this is acceptable for the downstream analysis.
for rating in ['Spin', 'Speed', 'Control']:
    rubber[rating] = rubber[rating].fillna(rubber[rating].mean())

#Product_Name Issues: collapse the non-breaking-space "B"/"N" markers
#embedded in some names into a single space (literal match, not regex)
for marker in ('\xa0\xa0B\xa0', '\xa0\xa0N\xa0'):
    rubber['Product_Name'] = rubber['Product_Name'].str.replace(marker, ' ', regex=False)

rubber.tail()

Unnamed: 0,Product_Name,Price,Spin,Speed,Control
343,Yasaka Valmo,29.95,90.0,99.0,86.0
344,Yasaka X-Tend HS,34.95,98.0,92.0,76.0
345,Yasaka X-Tend LB,14.95,120.0,110.0,80.0
346,Yasaka X-Tend PO,14.95,65.0,92.0,76.0
347,Yasaka X-Tend SD,33.95,120.0,110.0,80.0


In [11]:
# Describe each rubber feature with an example / observed range, then print
# the numeric summary statistics.
print('Features:')
print('Product Name: The name of the specific paddle rubber. Ex: {}'.format(
    rubber['Product_Name'][3]))

# (column, message template) pairs; each template receives the column's
# minimum and maximum.
range_templates = [
    ('Price', 'Price: The cost of the rubber in dollars. The data ranges from {} to {}'),
    ('Spin', 'Spin: The spin rating for the rubber. The data ranges from {} to {}'),
    ('Speed', 'Speed: The speed rating for the rubber. The data ranges from {} to {}'),
    ('Control', 'Control: The Control rating for the rubber. The data ranges from {} to {}'),
]
for column, template in range_templates:
    values = rubber[column]
    print(template.format(values.min(), values.max()))
print('\n')

print('Summary Statistics for Ping Pong Rubbers:')
summary = rubber.describe()
print(summary)
print('\n')
print()

Features:
Product Name: The name of the specific paddle rubber. Ex: Andro Chaos
Price: The cost of the rubber in dollars. The data ranges from 9.95 to 93.99
Spin: The spin rating for the rubber. The data ranges from 0.0 to 140.0
Speed: The speed rating for the rubber. The data ranges from 30.0 to 182.0
Control: The Control rating for the rubber. The data ranges from 35.0 to 120.0


Summary Statistics for Ping Pong Rubbers:
            Price        Spin       Speed     Control
count  348.000000  348.000000  348.000000  348.000000
mean    40.018161   94.747720   98.041298   80.742489
std     15.424764   21.649158   25.514951   10.949261
min      9.950000    0.000000   30.000000   35.000000
25%     29.950000   90.000000   89.000000   74.000000
50%     39.900000   97.000000   96.500000   80.742489
75%     49.950000  107.000000  113.000000   85.000000
max     93.990000  140.000000  182.000000  120.000000





In [7]:
#Saving cleaned dataset to csv
# Written through the same relative data/ directory the raw files are read
# from, instead of a machine-specific absolute Windows path, so the notebook
# runs on any checkout. index=False keeps the row index out of the file.
rubber.to_csv('data/Rubbers_new.csv', index = False)

### Blade Dataset

In [8]:
# Preview the first five raw blade rows before any cleaning.
blade.head(n=5)

Unnamed: 0,Product Name,Price,Speed,Control,Weight (g)
0,Andro Blax All+,$35.99,85.0,95.0,73.0
1,Andro Blax Off,$35.99,92.0,92.0,78.0
2,Andro CS7 Pro,$39.99,92.0,94.0,90.0
3,Andro CS7 Tour,$39.99,94.0,93.0,90.0
4,Andro CS7 Velocity,$39.99,96.0,92.0,90.0


In [9]:
# Legend for the row labels of the table below, then the null summary
# for the blade data.
print('False = Not Null count', 'True = Null count', sep='\n')
find_null(blade)

False = Not Null count
True = Null count


Unnamed: 0,Product Name,Price,Speed,Control,Weight (g)
False,355.0,355.0,242,237,334
True,0.0,0.0,113,118,21


In [10]:
#Rename Product Name Column (a no-op on re-run, once the column is renamed)
blade = blade.rename(columns = {'Product Name': 'Product_Name'})

#Remove dollar sign in Price column
# Going through str first keeps this cell re-runnable: on a second run Price
# is already float, and a per-element x.strip('$') would raise AttributeError.
blade['Price'] = blade['Price'].astype(str).str.strip('$').astype(float)

#Dealing with NA values: replace missing measurements with the column mean
# NOTE(review): the null table above shows Speed missing for 113 and Control
# for 118 of 355 rows, so mean imputation pins about a third of those columns
# to one value -- confirm this is acceptable for the downstream analysis.
for measure in ['Weight (g)', 'Speed', 'Control']:
    blade[measure] = blade[measure].fillna(blade[measure].mean())

#Product_Name Issues: collapse the non-breaking-space "B"/"N" markers
#embedded in some names into a single space (literal match, not regex)
for marker in ('\xa0\xa0B\xa0', '\xa0\xa0N\xa0'):
    blade['Product_Name'] = blade['Product_Name'].str.replace(marker, ' ', regex=False)

blade.tail()

Unnamed: 0,Product_Name,Price,Speed,Control,Weight (g)
350,Yasaka Sweden Extra,35.95,82.0,71.0,84.0
351,Yasaka Sweden Extra - Chinese Penhold,35.95,82.0,71.0,86.0
352,Yasaka Sweden Guardian,42.95,73.0,93.0,85.0
353,Yasaka Synergy,49.95,98.0,69.0,82.0
354,Yasaka Synergy - Chinese Penhold,39.95,98.0,69.0,82.0


In [11]:
# Describe each blade feature with an example / observed range, then print
# the numeric summary statistics.
print('Features:')
print('Product Name: The name of the specific ping pong blade. Ex: {}'.format(
    blade['Product_Name'][3]))

# (column, message template) pairs; each template receives the column's
# minimum and maximum.
range_templates = [
    ('Price', 'Price: The cost of the blade in dollars. The data ranges from {} to {}'),
    ('Speed', 'Speed: The speed rating for the blade. The data ranges from {} to {}'),
    ('Control', 'Control: The Control rating for the blade. The data ranges from {} to {}'),
    ('Weight (g)', 'Weight(g): The weight in grams of the blade. The data ranges from {} to {}'),
]
for column, template in range_templates:
    values = blade[column]
    print(template.format(values.min(), values.max()))
print('\n')

# Heading previously read "Rubbers" (copied from the rubber section) even
# though the table printed below is computed from the blade data.
print('Summary Statistics for Ping Pong Blades:')
print(blade.describe())
print('\n')
print()

Features:
Product Name: The name of the specific ping pong blade. Ex: Andro CS7 Tour
Price: The cost of the blade in dollars. The data ranges from 8.95 to 599.99
Speed: The speed rating for the blade. The data ranges from 50.0 to 113.0
Control: The Control rating for the blade. The data ranges from 40.0 to 104.0
Weight(g): The weight in grams of the blade. The data ranges from 59.0 to 103.0


Summary Statistics for Ping Pong Rubbers:
            Price       Speed     Control  Weight (g)
count  355.000000  355.000000  355.000000  355.000000
mean    95.601014   87.495868   78.101266   85.344311
std     80.689263    8.993798   10.482373    5.875210
min      8.950000   50.000000   40.000000   59.000000
25%     44.445000   86.000000   74.000000   83.000000
50%     65.950000   87.495868   78.101266   85.344311
75%    109.990000   92.000000   82.500000   90.000000
max    599.990000  113.000000  104.000000  103.000000





In [12]:
#Saving cleaned dataset to csv
# Written through the same relative data/ directory the raw files are read
# from, instead of a machine-specific absolute Windows path, so the notebook
# runs on any checkout. index=False keeps the row index out of the file.
blade.to_csv('data/Blades_new.csv', index = False)