In [8]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Location of the sample database CSV. The default is the original author's
# local path; set the SAMPLE_DB_PATH environment variable to load the file
# from somewhere else without editing the notebook.
DATA_PATH = os.environ.get("SAMPLE_DB_PATH", "/Users/varuniyer/synthacks '24/sample_db.csv")

# Raw table; later cells read the profile text columns and 'cosinesimilarity' from it.
df = pd.read_csv(DATA_PATH)

In [9]:
# Columns that are concatenated into one free-text "bio" per row.
bio_columns = ['interests/skills', 'field', 'location', 'language', 'challenges/expertise', 'supportstyle', 'priceRange']

# ' '.join raises TypeError on NaN or numeric cells, so blank out missing
# values and cast everything to str before joining. Rows that already hold
# only strings produce exactly the same text as a plain join.
df['bio'] = df[bio_columns].fillna('').astype(str).agg(' '.join, axis=1)

# Single text feature and the regression target.
X = df[['bio']]
y = df['cosinesimilarity']

# Turn each bio into a TF-IDF vector, keeping at most 1000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row, including rows that the
# later train/test split assigns to the test set — consider fitting on the
# training rows only.
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(X['bio'])

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a regression tree on the training rows (fit returns the estimator itself),
# then predict the target for the held-out rows.
decision_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

In [11]:
# Score the held-out predictions. The two metrics are computed independently.

# Calculate the R^2 Score
# NOTE(review): the recorded output for this value is nan — presumably the
# test split is too small for R^2 to be defined; verify the dataset size.
r2 = r2_score(y_test, y_pred)

# Mean squared error between the true and predicted targets.
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.0




In [12]:
print(f"R^2 Score: {r2}")

# Discretise targets and predictions into equal-width bins so that a
# classification-style accuracy can be computed on the regression output.
# Ten evenly spaced edges over the target range define nine intervals.
bins = np.linspace(y.min(), y.max(), 10)
n_bins = len(bins) - 1

# np.digitize returns len(bins) for any value >= the last edge, which would put
# the maximum target in a tenth bin of its own. Clipping to [1, n_bins] folds
# that value into the top interval (and any out-of-range value into the nearest
# end interval); labels for every other in-range value are unchanged.
y_test_binned = np.clip(np.digitize(y_test, bins), 1, n_bins)
y_pred_binned = np.clip(np.digitize(y_pred, bins), 1, n_bins)

R^2 Score: nan


In [13]:
# Percentage of test rows whose predicted bin matches the true bin.
accuracy = 100 * accuracy_score(y_test_binned, y_pred_binned)
print(f"Accuracy: {accuracy:.2f}%")

# Pair every TF-IDF vocabulary term with the importance the fitted tree
# assigned to it, ordered from most to least important.
feature_importances = decision_tree.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': tfidf.get_feature_names_out(), 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

Accuracy: 100.00%


In [14]:
# Show the first ten rows of the importance table as plain text.
print(importance_df.iloc[:10])

       Feature  Importance
0           20         0.0
1           30         0.0
2       coding         0.0
3  engineering         0.0
4      english         0.0
5     industry         0.0
6     insights         0.0
7    interview         0.0
8      network         0.0
9      virtual         0.0
