In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:

# Locations of the train / test CSV files
train_data_path, test_data_path = (
    './prod_count/producthunt_train.csv',
    './prod_count/producthunt_test.csv',
)

# Read both splits into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)


In [3]:
# Replace missing text values with a placeholder.
# The same mapping is applied to BOTH splits: previously only 'topics' was filled in the
# train set, so a missing 'tagline' there would reach the TF-IDF step as NaN.
# Re-binding (instead of inplace=True) keeps the cell idempotent on re-run.
text_fill_values = {'name': 'Unknown', 'tagline': 'Unknown', 'topics': 'Unknown'}
train_data = train_data.fillna(text_fill_values)
test_data = test_data.fillna(text_fill_values)

In [35]:
# Display the test DataFrame for a visual check.
test_data

Unnamed: 0,id,slug,name,tagline,commentsCount,dateAdded,timeAdded,topics
0,56702,the-trip-tribe,The Trip Tribe,Book travel with others that share your interests,5,2015-08-16,12:36:47,"Web App, Tech"
1,56703,superpersonal,Superpersonal,Personalised styling and virtual fitting room,1,2019-09-08,08:11:45,"Android, iOS, Fashion, Artificial Intelligence..."
2,56704,savemytime-time-tracker,SaveMyTime - Time Tracker,A new way to track time and analyse productivity,33,2016-09-19,09:03:54,"Android, Productivity, Tech"
3,56705,atheena,Atheena,Wall Street experts in your pocket,11,2021-04-17,07:01:00,"Web App, Fintech, Investing"
4,56706,spotiapp,SpotiApp,Export music to Spotify from any musical services,38,2020-03-02,08:00:00,"iOS, Music, Spotify, Developer Tools, Apple"
...,...,...,...,...,...,...,...,...
12703,69405,blend-bde06a5f-67d8-4677-a372-f14d4c6ba39d,Blend,Make simple gradient wallpapers for your iOS D...,4,2016-09-18,18:52:00,"iOS, iPad, Design Tools, Tech, Wallpaper"
12704,69406,mailchimp-product-recommendations,MailChimp Product Recommendations,Easily build automated personalized email mark...,1,2016-06-06,13:46:43,"Email Marketing, Tech"
12705,69407,hacking-human-nature-for-good,Hacking Human Nature for Good,How to apply behavioral economics when buildin...,14,2015-11-20,16:20:22,"Web App, Prototyping, Tech"
12706,69408,just-get-me-food,Just Get Me Food,Don't know where to eat? Problem solved.,12,2018-08-17,07:00:00,"Android, iOS, Web App, Health & Fitness"


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stack train and test (train rows first) so both splits share one TF-IDF vocabulary
# per text column.
# NOTE(review): fitting on train+test lets test text influence the vocabulary and IDF
# weights — confirm this is acceptable for the evaluation setup.
combined_data = pd.concat([train_data, test_data], axis=0)

# One vectorizer per text column. Re-fitting a single shared vectorizer on 'topics'
# would overwrite the vocabulary learned from 'tagline', so the tagline feature columns
# could no longer be mapped back to terms or applied to new data.
tagline_vectorizer = TfidfVectorizer(max_features=100)
topics_vectorizer = TfidfVectorizer(max_features=100)

# TF-IDF features for 'tagline'
tfidf_tagline = tagline_vectorizer.fit_transform(combined_data['tagline'])

# TF-IDF features for 'topics'
tfidf_topics = topics_vectorizer.fit_transform(combined_data['topics'])

# Backward-compatible alias: the original single vectorizer ended up fitted on 'topics'.
tfidf_vectorizer = topics_vectorizer


In [5]:
# Convert the sparse TF-IDF matrices into dense DataFrames with one named column per term.
# The number of columns is taken from each matrix: max_features is only an upper bound,
# so a hard-coded 100 raises a shape-mismatch error when a vocabulary has fewer terms.
tfidf_tagline_df = pd.DataFrame(
    tfidf_tagline.toarray(),
    columns=[f'tagline_tfidf_{i}' for i in range(tfidf_tagline.shape[1])],
)
tfidf_topics_df = pd.DataFrame(
    tfidf_topics.toarray(),
    columns=[f'topics_tfidf_{i}' for i in range(tfidf_topics.shape[1])],
)


In [6]:
import datetime
# Wall-clock timestamp captured once, when this cell runs; get_number_of_days()
# measures each row's age against it.
# NOTE(review): the resulting day counts change with the run date, so stored outputs
# are not reproducible across runs — consider pinning a fixed reference date.
now = datetime.datetime.now()


In [7]:
def get_number_of_days(features, reference=None):
    """Return the age of each row in whole days.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain the string columns ``dateAdded`` and ``timeAdded``; they are
        joined with a space and parsed into a single timestamp per row.
    reference : datetime-like, optional
        Moment the age is measured against. When omitted, the notebook-level
        ``now`` value is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.Series
        Whole-day component of ``reference - timestamp`` for every row, on the
        same index as ``features``.
    """
    if reference is None:
        # Fall back to the shared notebook timestamp so that every call made in
        # one session is measured against the same moment.
        reference = now

    datetime_col = pd.to_datetime(features["dateAdded"] + " " + features["timeAdded"])

    # Vectorised equivalent of mapping `lambda x: x.days` over the timedeltas.
    return (reference - datetime_col).dt.days

In [45]:
# Display the test DataFrame again for a visual check.
test_data

Unnamed: 0,id,slug,name,tagline,commentsCount,dateAdded,timeAdded,topics
0,56702,the-trip-tribe,The Trip Tribe,Book travel with others that share your interests,5,2015-08-16,12:36:47,"Web App, Tech"
1,56703,superpersonal,Superpersonal,Personalised styling and virtual fitting room,1,2019-09-08,08:11:45,"Android, iOS, Fashion, Artificial Intelligence..."
2,56704,savemytime-time-tracker,SaveMyTime - Time Tracker,A new way to track time and analyse productivity,33,2016-09-19,09:03:54,"Android, Productivity, Tech"
3,56705,atheena,Atheena,Wall Street experts in your pocket,11,2021-04-17,07:01:00,"Web App, Fintech, Investing"
4,56706,spotiapp,SpotiApp,Export music to Spotify from any musical services,38,2020-03-02,08:00:00,"iOS, Music, Spotify, Developer Tools, Apple"
...,...,...,...,...,...,...,...,...
12703,69405,blend-bde06a5f-67d8-4677-a372-f14d4c6ba39d,Blend,Make simple gradient wallpapers for your iOS D...,4,2016-09-18,18:52:00,"iOS, iPad, Design Tools, Tech, Wallpaper"
12704,69406,mailchimp-product-recommendations,MailChimp Product Recommendations,Easily build automated personalized email mark...,1,2016-06-06,13:46:43,"Email Marketing, Tech"
12705,69407,hacking-human-nature-for-good,Hacking Human Nature for Good,How to apply behavioral economics when buildin...,14,2015-11-20,16:20:22,"Web App, Prototyping, Tech"
12706,69408,just-get-me-food,Just Get Me Food,Don't know where to eat? Problem solved.,12,2018-08-17,07:00:00,"Android, iOS, Web App, Health & Fitness"


In [46]:
# Tagline TF-IDF rows from position len(train_data) onward (the test part of the
# combined frame). reset_index() without drop=True keeps the old row labels as an
# extra 'index' column.
tfidf_tagline_df.iloc[len(train_data):].reset_index()

Unnamed: 0,index,tagline_tfidf_0,tagline_tfidf_1,tagline_tfidf_2,tagline_tfidf_3,tagline_tfidf_4,tagline_tfidf_5,tagline_tfidf_6,tagline_tfidf_7,tagline_tfidf_8,...,tagline_tfidf_90,tagline_tfidf_91,tagline_tfidf_92,tagline_tfidf_93,tagline_tfidf_94,tagline_tfidf_95,tagline_tfidf_96,tagline_tfidf_97,tagline_tfidf_98,tagline_tfidf_99
0,56702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,56703,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,56704,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.471666,0.0,0.0,0.0,0.0
3,56705,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,56706,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12703,69405,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12704,69406,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12705,69407,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12706,69408,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [47]:
# Topics TF-IDF rows from position len(train_data) onward (the test part of the
# combined frame). reset_index() without drop=True keeps the old row labels as an
# extra 'index' column.
tfidf_topics_df.iloc[len(train_data):].reset_index()

Unnamed: 0,index,topics_tfidf_0,topics_tfidf_1,topics_tfidf_2,topics_tfidf_3,topics_tfidf_4,topics_tfidf_5,topics_tfidf_6,topics_tfidf_7,topics_tfidf_8,...,topics_tfidf_90,topics_tfidf_91,topics_tfidf_92,topics_tfidf_93,topics_tfidf_94,topics_tfidf_95,topics_tfidf_96,topics_tfidf_97,topics_tfidf_98,topics_tfidf_99
0,56702,0.0,0.0,0.000000,0.0,0.652842,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.649247,0.0,0.0,0.0,0.0
1,56703,0.0,0.0,0.391107,0.0,0.000000,0.000000,0.0,0.0,0.378875,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,56704,0.0,0.0,0.776694,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,56705,0.0,0.0,0.000000,0.0,0.352199,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.350259,0.0,0.0,0.0,0.0
4,56706,0.0,0.0,0.000000,0.0,0.000000,0.623229,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12703,69405,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12704,69406,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12705,69407,0.0,0.0,0.000000,0.0,0.392643,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.390480,0.0,0.0,0.0,0.0
12706,69408,0.0,0.0,0.416755,0.0,0.336427,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.334574,0.0,0.0,0.0,0.0


In [48]:
# Preview: test rows side by side with their TF-IDF slices. Each slice still carries
# its old row labels as an 'index' column because reset_index() is called without drop.
test_row_offset = len(train_data)
pd.concat(
    [
        test_data,
        tfidf_tagline_df.iloc[test_row_offset:].reset_index(),
        tfidf_topics_df.iloc[test_row_offset:].reset_index(),
    ],
    axis=1,
)

Unnamed: 0,id,slug,name,tagline,commentsCount,dateAdded,timeAdded,topics,index,tagline_tfidf_0,...,topics_tfidf_90,topics_tfidf_91,topics_tfidf_92,topics_tfidf_93,topics_tfidf_94,topics_tfidf_95,topics_tfidf_96,topics_tfidf_97,topics_tfidf_98,topics_tfidf_99
0,56702,the-trip-tribe,The Trip Tribe,Book travel with others that share your interests,5,2015-08-16,12:36:47,"Web App, Tech",56702,0.0,...,0.0,0.0,0.0,0.0,0.0,0.649247,0.0,0.0,0.0,0.0
1,56703,superpersonal,Superpersonal,Personalised styling and virtual fitting room,1,2019-09-08,08:11:45,"Android, iOS, Fashion, Artificial Intelligence...",56703,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,56704,savemytime-time-tracker,SaveMyTime - Time Tracker,A new way to track time and analyse productivity,33,2016-09-19,09:03:54,"Android, Productivity, Tech",56704,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,56705,atheena,Atheena,Wall Street experts in your pocket,11,2021-04-17,07:01:00,"Web App, Fintech, Investing",56705,0.0,...,0.0,0.0,0.0,0.0,0.0,0.350259,0.0,0.0,0.0,0.0
4,56706,spotiapp,SpotiApp,Export music to Spotify from any musical services,38,2020-03-02,08:00:00,"iOS, Music, Spotify, Developer Tools, Apple",56706,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12703,69405,blend-bde06a5f-67d8-4677-a372-f14d4c6ba39d,Blend,Make simple gradient wallpapers for your iOS D...,4,2016-09-18,18:52:00,"iOS, iPad, Design Tools, Tech, Wallpaper",69405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12704,69406,mailchimp-product-recommendations,MailChimp Product Recommendations,Easily build automated personalized email mark...,1,2016-06-06,13:46:43,"Email Marketing, Tech",69406,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12705,69407,hacking-human-nature-for-good,Hacking Human Nature for Good,How to apply behavioral economics when buildin...,14,2015-11-20,16:20:22,"Web App, Prototyping, Tech",69407,0.0,...,0.0,0.0,0.0,0.0,0.0,0.390480,0.0,0.0,0.0,0.0
12706,69408,just-get-me-food,Just Get Me Food,Don't know where to eat? Problem solved.,12,2018-08-17,07:00:00,"Android, iOS, Web App, Health & Fitness",69408,0.0,...,0.0,0.0,0.0,0.0,0.0,0.334574,0.0,0.0,0.0,0.0


In [54]:
# Attach the TF-IDF features to the original rows of each split.
# Assumes any other numeric / categorical features have already been prepared.
n_train_rows = len(train_data)

train_parts = [
    train_data,
    tfidf_tagline_df.iloc[:n_train_rows],
    tfidf_topics_df.iloc[:n_train_rows],
]
train_features = pd.concat(train_parts, axis=1)

# Test rows follow the train rows in the TF-IDF frames; their index is reset (and the
# old labels dropped) so they line up with test_data's own index.
test_parts = [test_data] + [
    tfidf_frame.iloc[n_train_rows:].reset_index(drop=True)
    for tfidf_frame in (tfidf_tagline_df, tfidf_topics_df)
]
test_features = pd.concat(test_parts, axis=1)

# Target variable (taken before the column is dropped below)
y_train = train_features['votesCount']

# Age of each item in days
test_features["total_days"] = get_number_of_days(test_features)
train_features["total_days"] = get_number_of_days(train_features)

# Drop identifier / raw-text / raw-date columns that are not model inputs
raw_columns = ['id', 'slug', 'name', 'tagline', 'topics', "dateAdded", "timeAdded"]
train_features = train_features.drop(columns=raw_columns + ['votesCount'])
test_features = test_features.drop(columns=raw_columns)


In [52]:
# Display the assembled test feature matrix for a visual check.
test_features

Unnamed: 0,commentsCount,index,tagline_tfidf_0,tagline_tfidf_1,tagline_tfidf_2,tagline_tfidf_3,tagline_tfidf_4,tagline_tfidf_5,tagline_tfidf_6,tagline_tfidf_7,...,topics_tfidf_91,topics_tfidf_92,topics_tfidf_93,topics_tfidf_94,topics_tfidf_95,topics_tfidf_96,topics_tfidf_97,topics_tfidf_98,topics_tfidf_99,total_days
0,5,56702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.649247,0.0,0.0,0.0,0.0,3030
1,1,56703,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1546
2,33,56704,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2630
3,11,56705,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.350259,0.0,0.0,0.0,0.0,959
4,38,56706,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12703,4,69405,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2630
12704,1,69406,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2735
12705,14,69407,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.390480,0.0,0.0,0.0,0.0,2933
12706,12,69408,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.334574,0.0,0.0,0.0,0.0,1933


In [16]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: only tree depth and number of trees are searched;
# the remaining candidates are currently disabled.
param_grid = dict(
    max_depth=[7],
    n_estimators=[1000, 10000],
    # learning_rate=[0.01, 0.1],
    # subsample=[0.8, 1],
    # colsample_bytree=[0.8, 1],
)

# Base regressor with a squared-error objective
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# 3-fold grid search scored by R^2, using all available cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=10,
).fit(train_features, y_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print("Лучшие параметры:", best_params)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 2/3; 1/2] START max_depth=7, n_estimators=1000..............................
[CV 1/3; 1/2] START max_depth=7, n_estimators=1000..............................
[CV 2/3; 2/2] START max_depth=7, n_estimators=10000.............................
[CV 3/3; 1/2] START max_depth=7, n_estimators=1000..............................
[CV 3/3; 2/2] START max_depth=7, n_estimators=10000.............................
[CV 1/3; 2/2] START max_depth=7, n_estimators=10000.............................
[CV 1/3; 1/2] END max_depth=7, n_estimators=1000;, score=0.460 total time= 1.8min
[CV 2/3; 1/2] END max_depth=7, n_estimators=1000;, score=0.462 total time= 1.9min
[CV 3/3; 1/2] END max_depth=7, n_estimators=1000;, score=0.488 total time= 1.9min
[CV 2/3; 2/2] END max_depth=7, n_estimators=10000;, score=0.449 total time=12.2min
[CV 1/3; 2/2] END max_depth=7, n_estimators=10000;, score=0.444 total time=12.2min
[CV 3/3; 2/2] END max_depth=7, n_estimators

In [57]:
# Best estimator selected by the grid search
best_model = grid_search.best_estimator_

# Predict the target for the test feature matrix
predictions = best_model.predict(test_features)

# Pair each test id with its prediction and write the result to CSV
predictions_df = pd.DataFrame({"id": test_data["id"], "votesCount": predictions})
predictions_df.to_csv("startup_count.csv", index=False)