In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.svm import SVC
import optuna

In [None]:
# Load the training data, index it by `id`, keep only the columns used for
# modelling, and shuffle the rows with a fixed seed so the order is reproducible.
train = (
    pd.read_csv('train.csv')
    .set_index('id')
    .drop(columns=['location', 'keyword'])
    .sample(frac=1, random_state=42)
)

In [None]:
# Class balance of the label column.
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=train, x='target', ax=ax)
ax.set_xlabel('')
ax.set_title('target')
plt.show()

In [None]:
# Features are the `text` column; labels are the `target` column.
X = train['text'].values
y = train['target'].values

In [None]:
# 70/30 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Learn the TF-IDF vocabulary/weights from the training texts only, then apply
# the same fitted transform to both splits.
# NOTE: this rebinds X_train / X_test from raw text to TF-IDF matrices, so the
# cell must be run exactly once after the split cell.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
def objective(trial, cv=5):
    """Optuna objective: cross-validated accuracy of an RBF-kernel SVC.

    Hyperparameters are scored with k-fold cross-validation on the training
    split only. Scoring trials on ``X_test``/``y_test`` would select the
    hyperparameters on the same data that is later used to report the final
    score, which makes that score optimistically biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` (log-uniform in [0.1, 1000]) and ``gamma``
        (log-uniform in [1e-4, 1]). ``kernel`` is a single-choice categorical
        so that it is recorded in ``study.best_params`` alongside the others.
    cv : int, default 5
        Number of cross-validation folds. Optuna calls ``objective(trial)``,
        so the default is what is used during optimisation.

    Returns
    -------
    float
        Mean accuracy over the ``cv`` folds (to be maximised).
    """
    # Function-scope import keeps this cell self-contained.
    from sklearn.model_selection import cross_val_score

    params={
        'kernel' : trial.suggest_categorical('kernel', ['rbf']),
        'C': trial.suggest_float('C',0.1,1000,log=True),
        'gamma':trial.suggest_float('gamma',0.0001,1,log=True)
    }
    clf = SVC(**params)
    # With an integer `cv` and a classifier, scikit-learn uses stratified
    # k-fold without shuffling, so the folds are deterministic.
    # NOTE: X_train is expected to be the TF-IDF matrix already, so the IDF
    # weights are shared across folds; only the classifier is refit per fold.
    fold_scores = cross_val_score(
        clf, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
# Seed the sampler so the hyperparameter search (and therefore `best_params`)
# is reproducible on Restart & Run All. TPESampler is the sampler
# `create_study` uses by default, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

In [None]:
# Best hyperparameters found by the study (keys: 'kernel', 'C', 'gamma');
# the bare expression on the last line displays them as the cell output.
best_params=study.best_params
best_params

In [None]:
# Final classifier built from the tuned hyperparameters; `best_params` holds
# exactly the 'kernel', 'C' and 'gamma' entries sampled in `objective`.
model = SVC(**best_params)

In [None]:
# Fit the final model on the training split.
model.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split (compare with the hold-out score below
# to gauge overfitting).
model.score(X_train,y_train)

In [None]:
# Mean accuracy on the 30% hold-out split.
model.score(X_test,y_test)

In [None]:
# Load the test file and apply the same preprocessing as for the training data.
test = (
    pd.read_csv('test.csv')
    .set_index('id')
    .drop(columns=['location', 'keyword'])
)

# Reuse the vectorizer fitted on the training texts; never refit on test data.
test_vec = vectorizer.transform(test['text'].values)
y_pred = model.predict(test_vec)

# One row per test id with its predicted label, written without the row index.
result = pd.DataFrame({'id': test.index.tolist(), 'target': y_pred})
result.to_csv('submission.csv', index=False)