In [1]:
from nltk import tokenize
from nltk import word_tokenize
from collections import Counter
import os
import re
# import spacy
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from textblob import TextBlob

In [2]:
# Disabled spaCy model load (the matching `import spacy` is commented out too).
# nlp_pl = spacy.load("pl_core_news_lg")
# NOTE(review): no cell visible here reads `tfidf_transformer`; classifier_factory
# builds its own TfidfVectorizer inside the pipeline. Candidate for removal once
# the rest of the notebook is confirmed not to use it.
tfidf_transformer = TfidfVectorizer()

In [7]:
def classifier_factory(df, textcol, catcol):
    """Train and evaluate a TF-IDF + linear SVM text classifier.

    The text values are converted with ``str`` on a separate Series, so the
    caller's DataFrame is not modified. The rows are split 67/33 with a fixed
    ``random_state`` of 42, the pipeline is fitted on the training part, and
    its accuracy on the held-out part is printed with four decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds both the text column and the label column.
    textcol : str
        Name of the column with the texts; every value is passed through ``str``.
    catcol : str
        Name of the column with the target labels.

    Returns
    -------
    tuple
        ``(text_clf, score)``: the fitted ``Pipeline`` and its accuracy on the
        held-out split.
    """
    # Coerce into a new Series rather than assigning back into ``df``: writing
    # to the caller's frame is a hidden side effect and makes the cell
    # non-idempotent.
    X = df[textcol].apply(str)
    y = df[catcol]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)
    text_clf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(max_iter=100000))])
    text_clf.fit(X_train, y_train)
    predictions = text_clf.predict(X_test)
    # Compute the accuracy once and reuse it for both the printout and the return value.
    score = metrics.accuracy_score(y_test, predictions)
    print(f"{score:.4f}")
    return text_clf, score

def load_and_transform(lang, random_state=None):
    """Load the labelled workbooks for ``lang`` and return them as one shuffled frame.

    Reads ``aoidos_<lang>/stories_<lang>.xlsx`` (tagged ``1``) and
    ``aoidos_<lang>/not_stories_<lang>.xlsx`` (tagged ``0``), concatenates
    them and shuffles all rows.

    Parameters
    ----------
    lang : str
        Language code used to build the directory and file names.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        shuffle unseeded, so every call may return a different row order;
        pass a seed to make the order reproducible.

    Returns
    -------
    pandas.DataFrame
        All rows of both workbooks with an added ``tag`` column. Row labels
        of the two source frames are kept as they were read (the index is
        not reset).
    """
    stories = pd.read_excel(f"aoidos_{lang}/stories_{lang}.xlsx").assign(tag=1)
    not_stories = pd.read_excel(f"aoidos_{lang}/not_stories_{lang}.xlsx").assign(tag=0)
    combined = pd.concat([stories, not_stories])
    # frac=1 draws every row exactly once, i.e. a full shuffle.
    return combined.sample(frac=1, random_state=random_state)

def dump_model(model, lang):
    """Pickle ``model`` to ``aoidos_<lang>/aoidos_<lang>.pkl``.

    The file is opened for binary writing and overwritten if present; the
    directory itself is not created here, so it has to exist already.
    """
    target = f"aoidos_{lang}/aoidos_{lang}.pkl"
    with open(target, mode="wb") as sink:
        pickle.dump(model, sink)
        
def load_model(lang):
    """Return the object unpickled from ``aoidos_<lang>/aoidos_<lang>.pkl``.

    Unpickling can run arbitrary code, so only use this on files written by
    ``dump_model`` or obtained from another trusted source.
    """
    source = f"aoidos_{lang}/aoidos_{lang}.pkl"
    with open(source, mode="rb") as stream:
        return pickle.load(stream)

def test_on_csv(path, model, lang, textcol="Treść wypowiedzi"):
    test_set = pd.read_csv(path, sep=";", engine="c", usecols=[textcol, "Rodzaj wzmianki"])
    if "Rodzaj wzmianki" in test_set.columns:
        test_set = test_set[test_set["Rodzaj wzmianki"] != "Artykuł"]
    test_set = test_set.drop_duplicates(subset=textcol)
    test_set[textcol] = test_set[textcol].apply(str)    
    test_set["tags"] = test_set[textcol].apply(lambda x: model.predict([x]))
    print(test_set["tags"].apply(lambda x: x[0]).value_counts(normalize=True))
    test_set = test_set[["Treść wypowiedzi","tags"]]
    test_set = test_set[test_set["Treść wypowiedzi"].apply(len)>100]
    
    test_set.to_excel(f"aoidos_{lang}/test_results_{lang}.xlsx")
    

In [8]:
# Polish model. load_and_transform shuffles the rows without a seed while the
# train/test split itself is seeded, so every attempt trains and evaluates on
# a different partition of the data.
# NOTE(review): picking the run by its held-out accuracy makes the reported
# score optimistic; consider cross-validation instead of retrying splits.
TARGET_SCORE = 0.94
MAX_ATTEMPTS = 50  # hard cap so the cell always terminates

aoidos_pl, score = None, float("-inf")
for attempt in range(1, MAX_ATTEMPTS + 1):
    df_pl = load_and_transform("pl")
    candidate, candidate_score = classifier_factory(df_pl, "text", "tag")
    # Keep the best model seen so far so later cells have something to use
    # even when the target is never reached.
    if candidate_score > score:
        aoidos_pl, score = candidate, candidate_score
    if score >= TARGET_SCORE:
        # As before, the pickle is only written once the target is met.
        dump_model(aoidos_pl, "pl")
        break
else:
    print(
        f"No split reached {TARGET_SCORE} in {MAX_ATTEMPTS} attempts "
        f"(best {score:.4f}); model not saved."
    )


0.8999
0.9054
0.9152
0.9096
0.9152
0.9235
0.9193
0.9124
0.9110
0.9138
0.9068
0.9318
0.9249
0.9152
0.9221
0.9110
0.9082
0.9138
0.9166
0.9207
0.9152


KeyboardInterrupt: 

In [37]:
# English model: one train/evaluate pass, then pickle the fitted pipeline.
df_en = load_and_transform(lang="en")
aoidos_en = classifier_factory(df=df_en, textcol="text", catcol="tag")[0]
dump_model(model=aoidos_en, lang="en")

0.9675


In [6]:
# Tag the exported mentions with the Polish model; test_on_csv writes the
# result to aoidos_pl/test_results_pl.xlsx.
test_on_csv(path='~\\desktop\\dat.csv', model=aoidos_pl, lang='pl')

0    0.977856
1    0.022144
Name: tags, dtype: float64
