In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv(r"C:\Users\adity\Downloads\datasets.csv")
df.head()  # Display the first few rows


Unnamed: 0,datasetName,about,link,categoryName,cloud,vintage
0,Microbiome Project,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,GloBI,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,CommonCraw 2012,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,Indiana Webclicks,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,


In [3]:
df.fillna("", inplace=True)
df["combined_text"] = df["datasetName"] + " " + df["about"] + " " + df["categoryName"]


  df.fillna("", inplace=True)


In [4]:
label_encoder = LabelEncoder()
df["category_encoded"] = label_encoder.fit_transform(df["categoryName"])


In [5]:
scaler = StandardScaler()
df[["category_encoded"]] = scaler.fit_transform(df[["category_encoded"]])


In [9]:
# Encode dataset names to numerical labels
dataset_encoder = LabelEncoder()
df["dataset_encoded"] = dataset_encoder.fit_transform(df["datasetName"])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    df[["category_encoded"]], df["dataset_encoded"], test_size=0.2, random_state=42
)


In [10]:
model = Sequential([
    Embedding(input_dim=len(df["categoryName"].unique()) + 1, output_dim=16),
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(8, activation='relu'),
    Dense(len(df["datasetName"].unique()), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 459ms/step - accuracy: 0.0000e+00 - loss: 4.1112 - val_accuracy: 0.0000e+00 - val_loss: 4.1134
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.0339 - loss: 4.1101 - val_accuracy: 0.0000e+00 - val_loss: 4.1163
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 0.0182 - loss: 4.1092 - val_accuracy: 0.0000e+00 - val_loss: 4.1193
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0182 - loss: 4.1083 - val_accuracy: 0.0000e+00 - val_loss: 4.1223
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.0182 - loss: 4.1076 - val_accuracy: 0.0000e+00 - val_loss: 4.1253
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.0339 - loss: 4.1065 - val_accuracy: 0.0000e+00 - val_loss: 4.1284
Epoch 7/10
[1m3/

<keras.src.callbacks.history.History at 0x1eb062a2ba0>

In [16]:
from difflib import get_close_matches

def find_closest_category(user_input, categories):
    matches = get_close_matches(user_input, categories, n=1, cutoff=0.4)
    return matches[0] if matches else None

def recommend_datasets(user_input):
    # Find closest matching category
    closest_category = find_closest_category(user_input, label_encoder.classes_)
    
    if closest_category is None:
        return "No related datasets found."

    user_vec = label_encoder.transform([closest_category])  # Use closest match
    user_vec = scaler.transform([user_vec])
    
    predictions = model.predict(user_vec)
    top_indices = predictions.argsort()[0][-5:][::-1]

    recommended_datasets = df.iloc[top_indices][["datasetName", "about", "link", "categoryName"]]

    if recommended_datasets.empty:
        return "No related datasets found."
    
    return recommended_datasets

# Example usage
user_domain = input("Enter your domain: ")
recommendations = recommend_datasets(user_domain)
print(recommendations)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
        datasetName                                         about  \
56   ClueWeb12 FACC                                ClueWeb12 FACC   
35        Formula 1  Ergast Formula 1, from 1950 up to date (API)   
37      Airlines OD                    Airlines OD Data 1987-2008   
13  World countries           World countries in multiple formats   
8   Localytics Data       Localytics Data Visualization Challenge   

                                                 link      categoryName  
56           http://lemurproject.org/clueweb12/FACC1/  Natural Language  
35                           http://ergast.com/mrd/db            Sports  
37  http://stat-computing.org/dataexpo/2009/the-da...    Transportation  
13               https://github.com/mledoze/countries               GIS  
8    https://github.com/localytics/data-viz-challenge   Data Challenges  


