# Парсинг и предобработка данных

In [None]:
#Импортирование необходимых библиотек
import codecs
import glob
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
from pandas import json_normalize

pd.set_option('display.max_columns', None)

**1 способ**

In [None]:
# Folder with the scraped data: one .json file per company.
path = 'Data'
file = glob.glob(path + "/*.json")


def read_post_rows(filenames):
    """Collect one flat row per post from the given JSON files.

    Every non-null item of a file's ``refs`` list is indexed as
    ``[post_text, {'day': ..., 'month': ..., 'time': ...}]``.

    Args:
        filenames: Iterable of paths to the JSON files.

    Returns:
        A list of ``[file_name, post, day, month, time]`` lists, where
        ``file_name`` is the file's base name without its extension.
    """
    rows = []
    for filename in filenames:
        # basename/splitext copes with whatever separator glob returns on the
        # current OS; splitting on "\\" only removed the folder on Windows.
        name = os.path.splitext(os.path.basename(filename))[0]

        # 'utf-8-sig' drops a leading BOM if the file starts with one.
        with codecs.open(filename, 'r', 'utf-8-sig') as json_file:
            data = json.load(json_file)

        for article in data['refs']:
            if article is None:
                continue
            post, meta = article[0], article[1]
            rows.append([name, post, meta['day'], meta['month'], meta['time']])
    return rows


df_data = read_post_rows(file)
df_full = pd.DataFrame(
    data=df_data,
    columns=['Название файла', 'Пост', 'день публикации', 'месяц публикации', 'время публикации'],
)

In [None]:
# Preview the first five rows of the posts table.
df_full.head(5)

**2 способ**

In [None]:
# Folder with the scraped data: one .json file per company.
path = 'Data'
file = glob.glob(path + "/*.json")


def read_posts_frame(filenames):
    """Build one DataFrame with a row per post from the given JSON files.

    Every non-null item of a file's ``refs`` list is indexed as
    ``[post_text, metadata]``; the metadata is flattened with
    ``json_normalize`` and placed between the ``Post`` and ``Company``
    columns.

    Args:
        filenames: Iterable of paths to the JSON files.

    Returns:
        A DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        no posts were found.
    """
    frames = []
    for filename in filenames:
        # basename/splitext copes with whatever separator glob returns on the
        # current OS; splitting on "\\" only removed the folder on Windows.
        name = os.path.splitext(os.path.basename(filename))[0]

        # 'utf-8-sig' drops a leading BOM if the file starts with one.
        with codecs.open(filename, 'r', 'utf-8-sig') as json_file:
            data = json.load(json_file)

        for article in data['refs']:
            if article is None:
                continue
            frames.append(
                pd.concat(
                    [
                        pd.DataFrame([article[0]], columns=['Post']),
                        json_normalize(article[1]),
                        pd.DataFrame([name], columns=['Company']),
                    ],
                    axis=1,
                )
            )

    # pd.concat raises on an empty list, so keep the empty-frame result explicit.
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per post.
    return pd.concat(frames, axis=0, ignore_index=True)


df_full = read_posts_frame(file)
        

In [None]:
# Print the DataFrame.info() summary of the posts table.
df_full.info()

In [None]:
# Preview the first rows of the posts table.
df_full.head()

Датафрейм с информацией о компании

In [None]:
# Folder with the scraped data: one .json file per company.
path = 'Data'
file = glob.glob(path + "/*.json")


def read_company_info(filenames):
    """Build a one-row-per-file table from each JSON file's ``info`` entry.

    A file whose ``info`` entry is missing or cannot be flattened gets the
    placeholder 'Не указано' in the four descriptive columns.

    Args:
        filenames: Iterable of paths to the JSON files.

    Returns:
        A DataFrame whose first columns are ``rate``, ``subs``,
        ``industries``, ``about`` and ``Company``, followed by any extra
        keys found under ``info``.
    """
    base_columns = ['rate', 'subs', 'industries', 'about', 'Company']
    frames = []
    for filename in filenames:
        # basename/splitext copes with whatever separator glob returns on the
        # current OS; splitting on "\\" only removed the folder on Windows.
        name = os.path.splitext(os.path.basename(filename))[0]

        # 'utf-8-sig' drops a leading BOM if the file starts with one.
        with codecs.open(filename, 'r', 'utf-8-sig') as json_file:
            data = json.load(json_file)

        try:
            info = json_normalize(data['info'])
        except (KeyError, TypeError, AttributeError, NotImplementedError):
            # KeyError: no 'info' key. The other types are meant to cover an
            # 'info' value that json_normalize cannot flatten. A bare except
            # would also have hidden KeyboardInterrupt and real bugs.
            info = pd.DataFrame({
                'rate': ['Не указано'],
                'subs': ['Не указано'],
                'industries': ['Не указано'],
                'about': ['Не указано'],
            })
        frames.append(
            pd.concat([info, pd.DataFrame([name], columns=['Company'])], axis=1)
        )

    if not frames:
        return pd.DataFrame(columns=base_columns)

    # One concat at the end instead of re-copying the growing frame per file.
    result = pd.concat(frames, axis=0, ignore_index=True)
    # Keep the fixed columns first, as the original empty template frame did.
    extra = [col for col in result.columns if col not in base_columns]
    return result.reindex(columns=base_columns + extra)


df = read_company_info(file)
df.head()
   

In [None]:
# Attach the company-level info to every post; the left join keeps every
# row of df_full.
tk = pd.merge(df_full, df, how='left', on='Company')

In [None]:
# Show (rows, columns) of the merged table.
tk.shape

In [None]:
# Preview the first rows of the merged table.
tk.head()

# Обработка текста

In [None]:
import pymorphy2
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Russian stop-word list from NLTK; clear_text drops lemmas found in it.
sw = stopwords.words('russian')
# Morphological analyzer; clear_text reads normal_form from its parse results.
morph = pymorphy2.MorphAnalyzer()

def clear_text(text):
    """Turn a post into a list of lemmas.

    The text is lower-cased, everything except the characters ``а-я`` and
    ``ё`` is treated as a word separator, the remaining words are tokenised
    and replaced by the ``normal_form`` of their first pymorphy2 parse, and
    lemmas that are stop words or have three characters or fewer are dropped.

    Args:
        text: The post text. Non-string values (for example a missing value)
            are converted with ``str`` first.

    Returns:
        A list of lemma strings; empty when nothing survives the filtering.
    """
    # Convert before lower(): a NaN/None post is not a str and has no .lower().
    text = str(text).lower()
    # Substitute a space instead of deleting, so words separated only by
    # punctuation or a line break ("привет,мир") are not glued together.
    text = re.sub(r'[^а-яё]+', ' ', text)
    tokens = word_tokenize(text, language="russian")
    lemmas = (morph.parse(token)[0].normal_form for token in tokens)
    # Cheap length check first, then the stop-word lookup.
    return [lemma for lemma in lemmas if len(lemma) > 3 and lemma not in sw]

In [None]:
# Run clear_text on every post; each cell of the new column holds the list
# it returns.
tk['lemmatize_tokens'] = tk['Post'].apply(clear_text)

In [None]:
# Preview the table with the new token column.
tk.head()

In [None]:
# Glue each row's lemma list back into one space-separated string.
tk['clear_text'] = tk['lemmatize_tokens'].apply(" ".join)

In [None]:
# Preview the table with the cleaned-text column.
tk.head()

In [None]:
# Save the processed table to data.csv without writing the row index.
tk.to_csv('data.csv', index=False)

# Векторизация текста и поиск ngram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer


In [None]:
# TF-IDF over unigrams and bigrams of the cleaned text, with document-frequency
# cut-offs min_df=5 and max_df=0.8.
tfidf = TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(tk['clear_text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dense copy of the matrix with one named column per term.
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
df_tfidf.head()

In [None]:
# Display the TF-IDF matrix object returned by fit_transform.
X_tfidf

In [None]:
# Copy the company name of each post into the feature table.
# NOTE(review): the assignment aligns on the index — presumably tk and
# df_tfidf share the same default row index; verify.
df_tfidf["Company"]=tk["Company"]

 # Кластеризация

In [None]:
from sklearn.cluster import KMeans, Birch, MiniBatchKMeans
from sklearn.decomposition import PCA

In [None]:
# Five-cluster k-means. random_state is pinned because the centroid
# initialisation is random: without it the cluster labels change between runs.
model = KMeans(n_clusters=5, random_state=42)

In [None]:
# Project the dense TF-IDF matrix onto its first two principal components;
# the seed keeps the result repeatable if a randomized SVD solver is chosen.
reduced_data = PCA(n_components=2, random_state=42).fit_transform(X_tfidf.toarray())
# fit_predict fits the model and returns the labels in one call; the former
# fit_transform result (distances to the centroids) was computed and discarded.
df_tfidf["cluster"] = model.fit_predict(reduced_data)

In [None]:
# Display the cluster label assigned to each row.
df_tfidf["cluster"]

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Print the silhouette score of the cluster labels on the 2-D PCA projection.
print("silhouette_score -", silhouette_score(reduced_data, df_tfidf["cluster"]))

# Классификация

In [None]:
# Load the target table and rename its company column to "Company", the key
# used for the merge with df_tfidf.
# NOTE(review): the key being renamed looks identical to "Company" but
# presumably starts with a Cyrillic "С" — confirm against Target1.json.
df=pd.read_json("Target1.json")
df = df.rename(columns = {"Сompany":"Company"})
df

In [None]:
# Display the company column of the feature table.
df_tfidf["Company"]

In [None]:
# Join the target table onto the feature table by company. No `how` is given,
# so merge() uses its default inner join and rows without a match are dropped.
df_tfidf=df_tfidf.merge(df, on='Company')
# Display the joined target column.
df_tfidf['Nominations']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the target column on a 10x4-inch figure.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(data=df_tfidf, x='Nominations', ax=ax)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the nomination label of each row.
y = df_tfidf['Nominations']
# Features: every other column except the target and the company key.
x = df_tfidf.drop(columns=['Nominations', 'Company'])

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded so it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier as Tree


In [None]:
# Decision tree limited in depth and minimum split/leaf sizes. random_state is
# pinned (same seed as the train/test split) so repeated runs build the same
# tree and report the same accuracy.
tree = Tree(max_depth=20, min_samples_split=4, min_samples_leaf=2, random_state=42)

In [None]:
# Train the decision tree on the training split.
tree.fit(x_train, y_train)

# Оценка модели

In [None]:
# Predict the labels of the held-out test rows.
predictions = tree.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
# Print the accuracy of the predictions against the true test labels.
print(accuracy_score(y_test, predictions))