In [1]:
from nltk import tokenize
from nltk import word_tokenize
from collections import Counter
import os
import re
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from textblob import TextBlob

In [2]:
# Module-level TfidfVectorizer instance (a vectorizer, despite the
# "transformer" in its name).
# NOTE(review): nothing in the visible cells reads this variable —
# classifier_factory builds its own TfidfVectorizer inside its pipeline —
# so this may be a leftover; confirm before removing.
tfidf_transformer = TfidfVectorizer()

In [41]:
def classifier_factory(df, textcol, catcol, test_size=.33, random_state=42):
    """Train a TF-IDF + LinearSVC text classifier and score it on a hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts and their labels. It is not modified.
    textcol : str
        Name of the column with the raw text; every value is passed through
        ``str`` before vectorisation.
    catcol : str
        Name of the column with the target labels.
    test_size : float, optional
        Fraction of rows held out for evaluation (forwarded to
        ``train_test_split``). Defaults to 0.33.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    tuple
        ``(fitted_pipeline, accuracy)`` where ``accuracy`` is the accuracy
        score of the pipeline on the held-out rows.

    Side effects
    ------------
    Prints the accuracy rounded to four decimal places.
    """
    # Stringify into a new Series instead of assigning back into ``df`` so the
    # caller's frame is left untouched.
    X = df[textcol].apply(str)
    y = df[catcol]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    text_clf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(max_iter=100000))])
    text_clf.fit(X_train, y_train)

    # Score once and reuse the value for both the printout and the return.
    score = metrics.accuracy_score(y_test, text_clf.predict(X_test))
    print(f"{score:.4f}")
    return text_clf, score

def load_and_transform(lang, random_state=None):
    """Load the labelled workbooks for ``lang`` and return them as one shuffled frame.

    Reads ``consumer_<lang>/cons_<lang>.xlsx`` and
    ``consumer_<lang>/not_cons_<lang>.xlsx``, adds a ``tag`` column (1 for rows
    from the first file, 0 for rows from the second), concatenates both and
    shuffles the rows.

    Parameters
    ----------
    lang : str
        Language code used to build the directory and file names.
    random_state : int or None, optional
        Seed for the shuffle. The default ``None`` gives a different order on
        every call; pass an integer for a reproducible order.

    Returns
    -------
    pandas.DataFrame
        All rows from both workbooks in shuffled order, with a fresh
        0..n-1 index.
    """
    cons = pd.read_excel(f"consumer_{lang}/cons_{lang}.xlsx", engine="openpyxl")
    not_cons = pd.read_excel(f"consumer_{lang}/not_cons_{lang}.xlsx", engine="openpyxl")
    cons["tag"] = 1
    not_cons["tag"] = 0

    # Both sheets are indexed from 0, so a plain concat would leave duplicate
    # index labels; rebuild the index so every row has a unique label.
    combined = pd.concat([cons, not_cons], ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

def dump_model(model, lang):
    """Pickle ``model`` to ``consumer_<lang>/consumer_<lang>.pkl``.

    The target directory is expected to exist already; any existing file at
    that path is overwritten. Returns ``None``.
    """
    destination = f"consumer_{lang}/consumer_{lang}.pkl"
    with open(destination, mode="wb") as sink:
        pickle.dump(model, sink)
        
def load_model(lang):
    """Return the object unpickled from ``consumer_<lang>/consumer_<lang>.pkl``.

    Note: unpickling can execute arbitrary code embedded in the file, so only
    point this at pickle files from a trusted source.
    """
    with open(f"consumer_{lang}/consumer_{lang}.pkl", mode="rb") as source:
        return pickle.load(source)

def test_on_csv(path, model, lang, textcol="Treść wypowiedzi"):
    try:
        test_set = pd.read_csv(path, sep=";", engine="c", usecols=[textcol, "Rodzaj wzmianki"])
    except:
        test_set = pd.read_excel(path, header=6)
        
    if "Rodzaj wzmianki" in test_set.columns:
        test_set = test_set[test_set["Rodzaj wzmianki"] != "Artykuł"]
        
    test_set = test_set.drop_duplicates(subset=textcol)
    test_set[textcol] = test_set[textcol].apply(str)    
    test_set["tags"] = test_set[textcol].apply(lambda x: model.predict([x]))
    print(test_set["tags"].apply(lambda x: x[0]).value_counts(normalize=True))
    test_set = test_set[[textcol,"tags"]]
    test_set = test_set[test_set[textcol].apply(len)>100]
    
    test_set.to_excel(f"consumer_{lang}/test_results_{lang}.xlsx")
    

In [1]:
# Train the Polish ("pl") classifier on freshly loaded and shuffled data,
# retrying until the hold-out accuracy reaches the target, then persist the
# accepted model.
# NOTE(review): accepting a model because a re-shuffled split happened to score
# well makes the reported accuracy optimistic — consider cross-validation.
TARGET_ACCURACY = 0.94
MAX_ATTEMPTS = 100  # upper bound so the cell cannot loop forever

for attempt in range(1, MAX_ATTEMPTS + 1):
    df_pl = load_and_transform("pl")
    consumer_pl, score = classifier_factory(df_pl, "text", "tag")
    if score >= TARGET_ACCURACY:
        break
else:
    raise RuntimeError(
        f"accuracy stayed below {TARGET_ACCURACY} after {MAX_ATTEMPTS} attempts "
        f"(last score: {score:.4f})"
    )

# dump_model returns None, so its result is not bound to a name.
dump_model(consumer_pl, "pl")
