In [1]:
import pickle

# Load the pre-built data set. Later cells index it as a 4-element sequence
# (positions 0-3) and unpack it into fit_predict().
# pickle.load can execute arbitrary code, so only load files you trust.
with open('train_test_data.pickle', mode='rb') as pickle_file:
    train_test_data = pickle.load(pickle_file)

In [2]:
import pandas as pd

In [3]:
# Read the train and test CSV files from the working directory.
train_df = pd.read_csv("train.csv")
# NOTE(review): VAL_DATA is re-bound from 'VAL_DATA.pickle' further down,
# before it is used for prediction, so this value is superseded.
VAL_DATA = pd.read_csv("test.csv")
# Disabled alternative that kept only the title values:
# VAL_DATA = test_df['title'].values

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from my_librery import *

In [5]:
# Notebook-wide settings.
SEED = 42
# Matches runs of lowercase Cyrillic letters (а-я plus ё).
TOKEN_PATTERN = "[а-яё]+"
# Russian stop words followed by English ones, as one flat list.
# `nltk` is not imported in the cells above; presumably it is provided by the
# star import from my_librery — TODO confirm.
STOP_WORDS = [
    word
    for language in ('russian', 'english')
    for word in nltk.corpus.stopwords.words(language)
]

In [18]:
from sklearn.preprocessing import RobustScaler, Normalizer
from sklearn.compose import ColumnTransformer
import sklearn.naive_bayes as nb
from sklearn.linear_model import LogisticRegression

# Baseline model: TF-IDF features from the 'url' and 'title' columns, then
# row normalisation, robust scaling without centering, and Bernoulli naive Bayes.
# NOTE(review): ngram_range starts at 0, which is not a usual n-gram size —
# confirm whether (1, 3) was intended.
pipeline = Pipeline(steps=[
    ('features', ColumnTransformer(transformers=[
        ('url', TfidfVectorizer(max_df=0.05, min_df=1, ngram_range=(0, 3)), 'url'),
        ('title', TfidfVectorizer(max_df=0.05, min_df=4, ngram_range=(0, 3)), 'title'),
    ])),
    ('norm', Normalizer()),
    ('scale', RobustScaler(with_centering=False)),
    ('clf', nb.BernoulliNB()),
])
# fit_predict is not defined in this notebook; presumably it comes from the
# star import of my_librery — TODO confirm what it reports.
fit_predict(pipeline, *train_test_data)

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Candidate model: TF-IDF over 'url' and 'title' feeding multinomial naive Bayes.
# NOTE(review): every ngram_range here starts at 0, which is not a usual
# n-gram size — confirm whether a lower bound of 1 was intended.
pipeline = Pipeline(steps=[
    ('features', ColumnTransformer(transformers=[
        ('url', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'url'),
        ('title', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'title'),
    ])),
    ('clf', MultinomialNB(alpha=1)),
])

# Search space: only the 'title' vectorizer is tuned.
# Add any other parameters you want to vary here.
parameters = {
    'features__title__max_df': [0.05, 0.04, 0.06, 0.08, 0.1],
    'features__title__min_df': [3, 2, 4, 5],
    'features__title__ngram_range': [(0, 3), (0, 2), (0, 4), (0, 5)],
}

# 5-fold cross-validated grid search scored by F1, fitted on elements 0 and 1
# of train_test_data (presumably the training features and labels).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(train_test_data[0], train_test_data[1])

# Best parameters and best estimator
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший оценщик:", grid_search.best_estimator_)

In [6]:
def remove_unicode(df, column):
    """Return a copy of `df` with non-ASCII characters removed from `column`.

    Every string in `column` keeps only the characters whose code point is
    below 128. The input frame is left untouched, so the caller still holds
    the original text after the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains `column`.
    column : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame; all other columns are carried over unchanged.
    """
    def to_ascii(value):
        # Non-string cells (e.g. a missing value) are passed through as-is
        # instead of raising while iterating over them.
        if not isinstance(value, str):
            return value
        return ''.join(ch for ch in value if ord(ch) < 128)

    # Work on a copy: assigning into `df` directly would silently rewrite the
    # caller's data and make "original" and "cleaned" the same object.
    cleaned = df.copy()
    cleaned[column] = cleaned[column].apply(to_ascii)
    return cleaned

# Strip non-ASCII characters from the 'title' column of elements 0 and 2 of
# train_test_data (presumably the train and validation feature frames).
train_df_new = remove_unicode(train_test_data[0], 'title')
val_df_new = remove_unicode(train_test_data[2], 'title')

In [9]:
# Same 4-element layout as train_test_data, with the cleaned frames in
# positions 0 and 2 and the original elements 1 and 3 reused as they are.
new_train_test_data = [train_df_new, train_test_data[1], val_df_new, train_test_data[3]]

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Same candidate model and search space as the previous grid search, this time
# fitted on the ASCII-only data in new_train_test_data.
# NOTE(review): every ngram_range here starts at 0, which is not a usual
# n-gram size — confirm whether a lower bound of 1 was intended.
pipeline = Pipeline(steps=[
    ('features', ColumnTransformer(transformers=[
        ('url', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'url'),
        ('title', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'title'),
    ])),
    ('clf', MultinomialNB(alpha=1)),
])

# Search space: only the 'title' vectorizer is tuned.
# Add any other parameters you want to vary here.
parameters = {
    'features__title__max_df': [0.05, 0.04, 0.06, 0.08, 0.1],
    'features__title__min_df': [3, 2, 4, 5],
    'features__title__ngram_range': [(0, 3), (0, 2), (0, 4), (0, 5)],
}

# 5-fold cross-validated grid search scored by F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(new_train_test_data[0], new_train_test_data[1])

# Best parameters and best estimator
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший оценщик:", grid_search.best_estimator_)

In [45]:
# Pipeline with hard-coded vectorizer settings (title: max_df=0.06, min_df=2,
# ngram_range=(0, 2)), evaluated on train_test_data.
# NOTE(review): ngram_range starts at 0 — confirm whether 1 was intended.
the_best = Pipeline(steps=[
    ('features', ColumnTransformer(transformers=[
        ('url', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'url'),
        ('title', TfidfVectorizer(max_df=0.06, min_df=2, ngram_range=(0, 2)), 'title'),
    ])),
    ('clf', MultinomialNB(alpha=1)),
])
fit_predict(the_best, *train_test_data)


In [39]:
# Re-declares the same `the_best` pipeline as the previous cell; the
# evaluation on the ASCII-only data is currently disabled.
# NOTE(review): ngram_range starts at 0 — confirm whether 1 was intended.
the_best = Pipeline(steps=[
    ('features', ColumnTransformer(transformers=[
        ('url', TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)), 'url'),
        ('title', TfidfVectorizer(max_df=0.06, min_df=2, ngram_range=(0, 2)), 'title'),
    ])),
    ('clf', MultinomialNB(alpha=1)),
])
# fit_predict(the_best, *new_train_test_data)


In [12]:
import numpy as np
# Stack elements 0 and 2 of new_train_test_data into one frame and give it a
# fresh 0..n-1 index so it lines up row by row with the labels below.
all_data = pd.concat([new_train_test_data[0], new_train_test_data[2]]).reset_index(drop=True)
all_y = np.concatenate([new_train_test_data[1], new_train_test_data[3]])

# Put the labels into a one-column frame and attach it next to the features.
labels_df = pd.DataFrame(all_y, columns=['label'])
all_data_frame = pd.concat([all_data, labels_df], axis=1)
all_data_frame.shape

In [13]:
def check_total_length(text):
    """Return True when `text` has more than three whitespace-separated words."""
    word_count = len(text.split())
    return word_count > 3

# Apply the function to the 'title' column and keep only the rows whose title
# has more than three whitespace-separated words.
df = all_data_frame[all_data_frame['title'].apply(check_total_length)]

df.shape

In [91]:
import pandas as pd
from collections import Counter
import re

# Word-frequency statistics over the titles of the rows labelled 0.
# NOTE(review): despite its name, class_1_titles holds the label == 0 rows.
class_1_titles = all_data_frame.loc[all_data_frame['label'] == 0, 'title']

# Count every \w+ token across those titles.
word_counts = Counter(
    word
    for title in class_1_titles
    for word in re.findall(r'\b\w+\b', title)
)

# (word, count) pairs ordered from most to least frequent.
most_common_words = word_counts.most_common()


def is_number(s):
    """Return True when `s` parses as a finite number, False otherwise.

    float() also accepts the spellings 'nan', 'inf' and 'infinity' (in any
    letter case), so ordinary words such as "Infinity" would otherwise be
    reported as numbers. Those are rejected here, and values float() cannot
    handle at all (e.g. None) return False instead of raising.
    """
    import math  # local import keeps the function self-contained in its cell

    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    return math.isfinite(value)
    
# Keep only the words that occur more than 170 times and are not numbers
words_over_170 = [word for word, count in most_common_words if count > 170 and not is_number(word)]

# Print the result
print(words_over_170)

In [109]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Builds the keyword-presence indicator feature
def words_presence_feature(df, words):
    """Flag each entry of `df` that contains at least one of `words`.

    `df` is processed with .apply(), so each element is tested on its own.
    Matching is a plain, case-sensitive substring test (`word in element`).
    Returns a one-column DataFrame of 1 (some word found) / 0 (none found).
    """
    def contains_any(text):
        for word in words:
            if word in text:
                return 1
        return 0

    return df.apply(contains_any).to_frame()

# Keywords whose presence in a title is turned into an indicator feature
words_to_check = ['porn', 'sex', 'fuck', 'dick', 'pussy', 'sperm', 'webcam', 'boobs',]

# Longer candidate keyword list, currently disabled:
# words_to_check = ['porn', 'sex', 'xxx', 'girls', 'big', 'anal', 'naked', 'pussy', 'ass', 'biqle', 'tits', 'fucked', 'daftsex', 'blowjob', 'sexy', 'erotic', 'nude', 'dick', 'porno', 'fuck', 'fucks', 'fucking', 'erotica', 'milf', 'ancensored', 'cum', 'amateur', 'hardcore', 'adult', 'busty', 'lesbian', 'cock', 'homemade', 'xvideos', 'stockings', 'gay', 'chick', 'lesbians', 'boobs', 'masturbation', 'group']
# Two transformers wrapping words_presence_feature: one checks the hand-picked
# keywords above, the other the frequent label-0 words in words_over_170.
for_1_class = FunctionTransformer(words_presence_feature, validate=False, kw_args={'words': words_to_check})
for_0_class = FunctionTransformer(words_presence_feature, validate=False, kw_args={'words': words_over_170})


In [100]:
# Add the two keyword-presence features (computed from 'title') to the pipeline
# NOTE(review): ngram_range starts at 0 — confirm whether 1 was intended.
the_best_1 = Pipeline(steps=[
    (
        'features',
        ColumnTransformer(transformers=[
            (
                'url',
                TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)),
                'url',
            ),
            (
                'title',
                TfidfVectorizer(max_df=0.06, min_df=2, ngram_range=(0, 2)),
                'title',
            ),
            ('for_1_class', for_1_class, 'title'),
            ('for_0_class', for_0_class, 'title'),
        ]),
    ),
    ('clf', MultinomialNB(alpha=1)),
])
# fit_predict(the_best_1, *new_train_test_data)

In [118]:
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin

class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Transformer that turns each document into one averaged Word2Vec vector.

    fit() trains a Word2Vec model on the whitespace-tokenised documents.
    transform() averages the vectors of the tokens known to that model and
    takes the element-wise absolute value, so every output component is
    non-negative. A document with no known token maps to an all-zero vector.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        # Settings forwarded unchanged to Word2Vec in fit().
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        # Trained model; stays None until fit() is called.
        self.model = None

    def fit(self, X, y=None):
        """Train the Word2Vec model on the documents in `X`; `y` is ignored."""
        tokenized_docs = [doc.split() for doc in X]
        self.model = Word2Vec(
            tokenized_docs,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
        )
        return self

    def transform(self, X):
        """Return an array with one absolute mean vector per document in `X`."""
        rows = []
        for doc in X:
            known_vectors = [
                self.model.wv[token]
                for token in doc.split()
                if token in self.model.wv
            ]
            if not known_vectors:
                # No token of this document is in the vocabulary.
                known_vectors = [np.zeros(self.vector_size)]
            rows.append(abs(np.mean(known_vectors, axis=0)))
        return np.array(rows)



In [120]:
from sklearn.svm import SVC

# TF-IDF on 'url' and 'title' plus the two keyword-presence features, with a
# default MultinomialNB, evaluated on the ASCII-only data.
# NOTE(review): ngram_range starts at 0 — confirm whether 1 was intended.
the_best_1 = Pipeline(steps=[
    (
        'features',
        ColumnTransformer(transformers=[
            (
                'url',
                TfidfVectorizer(max_df=0.05, min_df=3, ngram_range=(0, 3)),
                'url',
            ),
            (
                'title',
                TfidfVectorizer(max_df=0.06, min_df=2, ngram_range=(0, 2)),
                'title',
            ),
            ('for_1_class', for_1_class, 'title'),
            ('for_0_class', for_0_class, 'title'),
        ]),
    ),
    ('clf', MultinomialNB()),
])
fit_predict(the_best_1, *new_train_test_data)

In [133]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
def get_ngrams(text, n):
    """Return the n-grams of `text`, each one joined into a single string.

    The items of `text` are grouped by nltk.ngrams(); for a plain string that
    means consecutive characters.
    """
    return list(map(''.join, nltk.ngrams(text, n)))
# sentence = 'лекция протексты'
# get_ngrams(sentence, 2)



# Resources shared by every preprocess_text() call; filled on first use.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Text preprocessing function
def preprocess_text(text):
    """Tokenise `text`, drop English stop words and lemmatise the rest.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Lemmatised tokens, in their original order.

    The stop-word set and the lemmatizer are created once and reused. This
    function is passed as `tokenizer=` to TfidfVectorizer and therefore runs
    once per document, so rebuilding them on every call is wasted work.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenise, remove stop words, then lemmatise what is left.
    # Disabled alternative tokenisation: words = get_ngrams(text, 7)
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

# Use preprocess_text as the tokenizer of both TfidfVectorizers
# NOTE(review): ngram_range starts at 0 — confirm whether 1 was intended.
the_best_1 = Pipeline(steps=[
    (
        'features',
        ColumnTransformer(transformers=[
            (
                'url',
                TfidfVectorizer(
                    max_df=0.05,
                    min_df=3,
                    ngram_range=(0, 3),
                    tokenizer=preprocess_text,
                ),
                'url',
            ),
            (
                'title',
                TfidfVectorizer(
                    max_df=0.06,
                    min_df=2,
                    ngram_range=(0, 2),
                    tokenizer=preprocess_text,
                ),
                'title',
            ),
            ('for_1_class', for_1_class, 'title'),
            ('for_0_class', for_0_class, 'title'),
        ]),
    ),
    ('clf', MultinomialNB()),
])
# fit_predict(the_best_1, *new_train_test_data)


In [134]:
# Fit the pipeline on the merged data (all_data with labels all_y); this
# fitted model is the one used for the predictions written out below.
the_best_1.fit(all_data, all_y)

In [38]:
# Number of rows in the merged data set
len(all_data)

In [135]:
import pickle

# Load the data to predict on; this replaces the VAL_DATA value read from
# test.csv near the top of the notebook.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('VAL_DATA.pickle', mode='rb') as pickle_file:
    VAL_DATA = pickle.load(pickle_file)

In [136]:
# Predict labels for VAL_DATA and attach them to the rows of test.csv.
# NOTE(review): this assumes VAL_DATA has the same rows, in the same order,
# as test.csv — confirm.
results = the_best_1.predict(VAL_DATA)
test_df = pd.read_csv("test.csv").assign(label=results)

# Write the ID / label pairs without the index column.
test_df[["ID", "label"]].to_csv("Current_best_merged.csv", index=False)