In [33]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns

In [4]:
# Load the news corpus into a DataFrame and display it.
news_path = 'datasets/news.csv'
df = pd.read_csv(news_path)
df

Unnamed: 0.1,Unnamed: 0,news,source
0,0,Официальный аккаунт PlayStation опубликовал т...,1.0
1,1,Китайская компания Mobvoi опубликовала на офи...,1.0
2,2,Практически во всех странах мира введены огра...,0.0
3,3,"Депутат Госдумы, единоросс Антон Горелкин вне...",2.0
4,4,Совет директоров «Почты России» проголосовал ...,2.0
...,...,...,...
19457,19457,С 17 декабря в России блокируется один из кру...,
19458,19458,Первые упоминания разработки тёмной темы в Go...,
19459,19459,Компания Oppo презентовала два новых продукта...,
19460,19460,"Основой презентации стала Windows 10, которая...",


In [5]:
# Show only the rows that already carry a `source` label.
df.loc[df['source'].notna()]

Unnamed: 0.1,Unnamed: 0,news,source
0,0,Официальный аккаунт PlayStation опубликовал т...,1.0
1,1,Китайская компания Mobvoi опубликовала на офи...,1.0
2,2,Практически во всех странах мира введены огра...,0.0
3,3,"Депутат Госдумы, единоросс Антон Горелкин вне...",2.0
4,4,Совет директоров «Почты России» проголосовал ...,2.0
...,...,...,...
18355,18355,С 17 декабря в России блокируется один из кру...,1.0
18356,18356,Первые упоминания разработки тёмной темы в Go...,1.0
18357,18357,Компания Oppo презентовала два новых продукта...,1.0
18358,18358,"Основой презентации стала Windows 10, которая...",0.0


In [6]:
# Number of distinct non-missing `source` labels (nunique ignores NaN by default).
n_clusters = df['source'].nunique()

In [7]:
def _clear_text_cyr_lat(text): #чистка от пунктуации и пр.
            try: 
                text=re.sub(r'[^а-яА-ЯёЁa-zA-Z]',' ', text)
                return ' '.join(text.split())  
            except TypeError:    
                print(type(text))
                return ''
def _quick_lemmas(series): # быстрая лемматизация
        series=series.apply(_clear_text_cyr_lat).apply(str.lower)
        unique_words_set = set()
        for keyword in series:
            for word in keyword.split(' '):
                if len(word)>0:
                    unique_words_set.add(word)
        unique_words_list =list(unique_words_set)        
        from pymystem3 import Mystem
        m = Mystem()
        unique_words = ' '.join(unique_words_set).lstrip()
        lemmas_list = m.lemmatize(unique_words)
        lemmas_list = lemmas_list[:-1]
        lemmas_list = [x for x in lemmas_list if x != ' ']
        words_dict={}
        words_dict = dict(zip(unique_words_list, lemmas_list))
        def _do_lemmas(keyword):
            lemmas = ''

            for word in keyword.split(' '):
                if len(word)>0:
                    lemmas +=' '+words_dict[word]

            return lemmas
        series = series.apply(_do_lemmas)
        return series

In [8]:
# Strip everything except Cyrillic/Latin letters from every article.
df['news'] = df['news'].apply(_clear_text_cyr_lat)

In [9]:
# Replace each cleaned article with its lemmatized form.
lemmatized_news = _quick_lemmas(df['news'])
df['news'] = lemmatized_news

In [10]:
# Length of each article measured as the number of pieces produced by splitting
# on a single space; the last expression shows the largest such length.
token_counts = [len(text.split(' ')) for text in df['news']]
df['len'] = token_counts
df['len'].max()

6315

In [11]:
# Upper bound on the TF-IDF vocabulary size; 6315 is the value displayed above
# as the maximum of df['len'].
MAX_FEATURES = 6315
vect = TfidfVectorizer(max_features=MAX_FEATURES)

In [12]:
# Fit the TF-IDF vectorizer on the `news` column of the whole frame
# (every row of `df` is passed in, not only the labelled ones).
vect.fit(df['news'])

TfidfVectorizer(max_features=6315)

In [19]:
# Dense TF-IDF matrix as a DataFrame, one row per article.
# The frame reuses df's index on purpose: it is later joined to `df` with
# pd.concat(axis=1), which aligns rows by index label, so a default RangeIndex
# here would misalign (and NaN-pad) rows whenever df's index is not 0..n-1.
# NOTE(review): .toarray() materializes the full rows x max_features matrix in
# memory — confirm this fits for the data size in use.
features = pd.DataFrame(vect.transform(df['news']).toarray(), index=df.index)

In [22]:
# Put the TF-IDF feature columns next to the original columns (aligned by index).
df_vect = pd.concat([df, features], axis='columns')

In [23]:
# Display the combined frame: the columns of `df` followed by the TF-IDF feature columns.
df_vect

Unnamed: 0.1,Unnamed: 0,news,source,len,0,1,2,3,4,5,...,6305,6306,6307,6308,6309,6310,6311,6312,6313,6314
0,0,официальный аккаунт playstation опубликовыват...,1.0,81,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,китайский компания mobvoi опубликовывать на о...,1.0,104,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,практически во все страна мир вводить огранич...,0.0,414,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,депутат госдума единоросс антон горелкин внос...,2.0,537,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,совет директор почта россия проголосовать за ...,2.0,82,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19457,19457,с декабрь в россия блокироваться один из круп...,,102,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19458,19458,первый упоминание разработка темный тема в go...,,114,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19459,19459,компания oppo презентовать два новый продукт ...,,169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19460,19460,основа презентация становиться windows которы...,,936,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Rows whose `source` is missing are the ones to classify; drop the helper
# columns so only `source` and the TF-IDF features remain.
unlabeled_mask = df_vect['source'].isna()
to_predict = df_vect.loc[unlabeled_mask].drop(['Unnamed: 0','news','len'], axis=1)

In [27]:
# Display the rows selected for prediction (those whose `source` is missing).
to_predict

Unnamed: 0,source,0,1,2,3,4,5,6,7,8,...,6305,6306,6307,6308,6309,6310,6311,6312,6313,6314
18360,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
18361,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
18362,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
18363,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08405
18364,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19457,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
19458,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
19459,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
19460,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000


In [28]:
# Rows with a known `source` form the labelled set; drop the helper columns so
# only `source` and the TF-IDF features remain.
labeled_mask = df_vect['source'].notna()
to_train = df_vect.loc[labeled_mask].drop(['Unnamed: 0','news','len'], axis=1)

In [30]:
# Hold out 25% of the labelled rows for validation.
# The split is seeded (same seed value the SVC below receives) so that the
# reported accuracy and the final predictions are reproducible after a kernel
# restart; without random_state every run draws a different split.
X, X_val, y, y_val = train_test_split(
    to_train.drop(columns='source'),
    to_train['source'],
    test_size=0.25,
    random_state=111,
)

In [31]:
from sklearn.svm import SVC

In [34]:
# Train an SVM classifier on the training split, then report its accuracy on
# the held-out validation split.
model = SVC(random_state=111).fit(X, y)
pred = model.predict(X_val)
accuracy_score(y_true=y_val, y_pred=pred)

0.9675381263616558

In [36]:
# Predict a `source` label for every unlabelled article (features only).
unlabeled_features = to_predict.drop('source', axis=1)
prediction = model.predict(unlabeled_features)

In [37]:
# Write the predicted labels, one per line, without header or index column.
prediction_frame = pd.DataFrame(prediction)
prediction_frame.to_csv('news.csv', index=False, header=False)