In [24]:
import pandas as pd

# Read the raw table from db.csv.
df = pd.read_csv("db.csv")

# Cast every typed column in a single astype() call: integer columns
# (including the 0/1 flag columns listed in bool_columns) and float columns.
bool_columns = ["ISO_9001", "OEKO_TEX_Standard_100", "GOTS", "ISO_14001", "Fair_Trade_Certified", "REACH", "SA8000"]
int_columns = ["vendor_id", "quantity", *bool_columns, "item_quantity_available"]
float_columns = ["latitude", "longitude", "item_price"]
df = df.astype({
    **dict.fromkeys(int_columns, int),
    **dict.fromkeys(float_columns, float),
})

# Show the typed DataFrame.
print(df)

     vendor_id             name  quantity  ISO_9001  OEKO_TEX_Standard_100  \
0          101   CottonCrafters       100         1                      1   
1          102        WoolWorks       250         0                      1   
2          103     JuteJunction        80         0                      0   
3          104        YarnYield       150         0                      0   
4          105      LeatherLine       120         1                      1   
..         ...              ...       ...       ...                    ...   
145         96      FiberFusion       200         0                      0   
146         97        NylonNest       100         1                      1   
147         98  SyntheticSource       150         0                      0   
148         99      LeatherLine       200         1                      0   
149        100    FabricFactory       180         0                      0   

     GOTS  ISO_14001  Fair_Trade_Certified  REACH  SA8000  lati

In [25]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Columns that are rescaled below; all other columns are left untouched.
numeric_columns = ["quantity", "latitude", "longitude", "item_price", "item_quantity_available"]

# Min-max scaling: each listed column is rescaled to MinMaxScaler's default
# [0, 1] range, and the scaled values overwrite the originals in df.
# Alternative (currently disabled): standardise instead by swapping in
#   standard_scaler = StandardScaler()
#   df[numeric_columns] = standard_scaler.fit_transform(df[numeric_columns])
min_max_scaler = MinMaxScaler()
df[numeric_columns] = min_max_scaler.fit_transform(df[numeric_columns])
df.head()

Unnamed: 0,vendor_id,name,quantity,ISO_9001,OEKO_TEX_Standard_100,GOTS,ISO_14001,Fair_Trade_Certified,REACH,SA8000,latitude,longitude,item_name,item_price,item_quantity_available
0,101,CottonCrafters,0.117647,1,1,0,0,0,0,0,0.823103,0.546748,Cotton,0.235294,0.2
1,102,WoolWorks,1.0,0,1,1,0,0,0,0,0.638022,0.0,Wool,0.823529,1.0
2,103,JuteJunction,0.0,0,0,0,1,1,0,0,0.0,0.43788,Jute,0.352941,0.0
3,104,YarnYield,0.411765,0,0,1,0,0,1,0,0.393561,1.0,Yarn,0.117647,0.466667
4,105,LeatherLine,0.235294,1,1,0,0,0,0,0,1.0,0.007969,Leather,0.0,0.266667


In [36]:
import pandas as pd
# One-hot encode item_name only: one 0/1 indicator column per distinct value,
# named item_name_<value>. Encoding just this column avoids building dummies
# for the whole frame and then filtering by prefix.
df1 = pd.get_dummies(df["item_name"], prefix="item_name").astype(int)

# Concatenate (not sum) each row's indicator digits into a single bit-string,
# e.g. '100000000'.
encoded_item_name = df1.astype(str).apply("".join, axis=1)

# Add the new column to the original DataFrame
df["encoded_item_name"] = encoded_item_name

# errors="ignore" keeps this cell re-runnable: df is rebound here, so on a
# second execution 'name' is already gone and a plain drop would raise KeyError.
df = df.drop(columns="name", errors="ignore")

# Display the DataFrame
print(df)


     vendor_id  quantity  ISO_9001  OEKO_TEX_Standard_100  GOTS  ISO_14001  \
0          101  0.117647         1                      1     0          0   
1          102  1.000000         0                      1     1          0   
2          103  0.000000         0                      0     0          1   
3          104  0.411765         0                      0     1          0   
4          105  0.235294         1                      1     0          0   
..         ...       ...       ...                    ...   ...        ...   
145         96  0.705882         0                      0     1          1   
146         97  0.117647         1                      1     0          0   
147         98  0.411765         0                      0     1          1   
148         99  0.705882         1                      0     0          0   
149        100  0.588235         0                      0     1          1   

     Fair_Trade_Certified  REACH  SA8000  latitude  longitude  

In [39]:
# Convert the whole DataFrame (every column it currently holds) into a 2-D
# NumPy array with one row per DataFrame row.
# NOTE(review): the printed result mixes numbers with quoted strings such as
# '100000000', so the array is not purely numeric — confirm the string columns
# are dropped or encoded before passing rows to arithmetic helpers.
vectors = df.to_numpy()

print(vectors)

[[101 0.11764705882352944 1 ... 0.23529411764705888 0.19999999999999996
  '100000000']
 [102 0.9999999999999999 0 ... 0.8235294117647058 1.0 '000000010']
 [103 0.0 0 ... 0.3529411764705882 0.0 '001000000']
 ...
 [98 0.4117647058823529 0 ... 0.7058823529411764 0.33333333333333337
  '000001000']
 [99 0.7058823529411765 1 ... 1.0 0.4666666666666667 '000100000']
 [100 0.5882352941176471 0 ... 0.588235294117647 0.6666666666666666
  '000000100']]


In [42]:
from math import sqrt
def square_rooted(x):
    """Return the Euclidean (L2) norm of ``x``, rounded to three decimals.

    Args:
        x: Iterable of numbers.

    Returns:
        ``sqrt(sum of squared elements)`` rounded to 3 decimal places;
        ``0.0`` for an empty iterable.
    """
    sum_of_squares = sum(component * component for component in x)
    return round(sqrt(sum_of_squares), 3)

def cosine_similarity(x, y):
    """Return the cosine similarity of two numeric vectors, rounded to 3 decimals.

    The norms are computed here at full precision. Rounding each norm to three
    decimals before dividing (as the ``square_rooted`` helper does) can push the
    result above 1 and makes small vectors look like zero vectors.

    Args:
        x: Iterable of numbers (lists, tuples, generators, array rows).
        y: Iterable of numbers. Expected to have the same length as ``x``;
            the dot product pairs elements with ``zip`` and therefore stops at
            the shorter input, while each norm uses its full vector.

    Returns:
        ``dot(x, y) / (|x| * |y|)`` rounded to 3 decimal places, or ``0.0`` when
        either vector has zero norm (including empty input), where the cosine
        is undefined.
    """
    # Materialise once: both inputs are traversed twice (dot product and norm),
    # which would silently exhaust a generator on the first pass.
    xs, ys = tuple(x), tuple(y)

    dot = sum(a * b for a, b in zip(xs, ys))
    norm_product = sqrt(sum(a * a for a in xs)) * sqrt(sum(b * b for b in ys))

    if norm_product == 0:
        return 0.0
    return round(dot / norm_product, 3)

# Sanity check on two hand-written 4-element vectors; prints 0.972.
print(cosine_similarity([3, 45, 7, 2], [2, 54, 13, 15]))

0.972
