In [9]:
import requests
import pandas as pd
from datetime import datetime
import time
from datetime import timedelta
import pickle
import re
import preprocessor as p
from nltk.corpus import stopwords
import pymorphy2
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import collections
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from IPython.display import Markdown as md

# Part 1

### Retrieving posts 
Posts were retrieved from the following vk groups: 
* https://vk.com/itmomagistry 
* https://vk.com/careers_service_itmo 
* https://vk.com/itmoru 
* https://vk.com/itmo_exchange 
* https://m.vk.com/scicomm).

In [217]:
def get_posts(group):

    count = 100
    offset = 0 
    posts = []
    retrieved_posts = 'yes'
    
    while retrieved_posts:

        url = 'https://api.vk.com/method/wall.get'
        params = {
            'domain': group,
            'filter': 'owner',
            'count': count,
            'offset': offset,
            'access_token': '8e84f1f18e84f1f18e84f1f1db8eeb68e388e848e84f1f1d0bb32bd2b9ea45a67630377',
            'v': 5.73
        }

        retrieved_posts = requests.get(url, params = params).json()['response']['items']

        posts += retrieved_posts 
        offset += count   
        time.sleep(0.5)
            
    return posts

In [218]:
groups = ['itmomagistry', 'careers_service_itmo', 'itmoru', 'itmo_exchange', 'scicomm']

posts = []
for group in groups:
    posts += get_posts(group=group)

In [10]:
# pickle.dump( posts, open( "posts.p", "wb" ) )
posts = pickle.load( open( "posts.p", "rb" ) )

### Splitting posts into old (older than a year ago) and new (later than a year ago) ones. 
The old ones will be used for training. The new ones will be used for testing. 

In [11]:
def split_train_test(posts):
    test = []
    train = []
    
    year_ago = (datetime.now()-timedelta(days=365)).timestamp()
    
    for post in posts:
        if post['date'] > year_ago:
            test.append(post)
        else:
            train.append(post)
            
    return test, train

In [12]:
test_posts, train_posts = split_train_test(posts)

In [13]:
md(f'We have retrieved {len(posts)} posts.<br>{len(train_posts)} from them were published before February 2019. These posts will be used for training<br>{len(test_posts)} were published after February 2019. These posts will be used for testing')

We have retrieved 13586 posts.<br>11105 from them were published before February 2019. These posts will be used for training<br>2481 were published after February 2019. These posts will be used for testing

#### Post example

In [14]:
train_posts[0]

{'id': 3764,
 'from_id': -54201931,
 'owner_id': -54201931,
 'date': 1549961199,
 'marked_as_ads': 0,
 'post_type': 'post',
 'text': '‚ú®–î–µ–Ω—å —Ä–æ—Å—Å–∏–π—Å–∫–æ–π –Ω–∞—É–∫–∏.\n–ü—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –≤ —á–µ—Å—Ç—å –æ—Å–Ω–æ–≤–∞–Ω–∏—è –≤ –Ω–∞—à–µ–π —Å—Ç—Ä–∞–Ω–µ –ê–∫–∞–¥–µ–º–∏–∏ –Ω–∞—É–∫. –û–Ω–∞ –±—ã–ª–∞ —Å–æ–∑–¥–∞–Ω–∞ 295 –ª–µ—Ç –Ω–∞–∑–∞–¥.\nüìù –ü–æ–¥—Ä–æ–±–Ω–æ—Å—Ç–∏ —á–∏—Ç–∞–π—Ç–µ –ø–æ —Å—Å—ã–ª–∫–µ: \nhttps://www.1tv.ru/news/2019-02-08/360076-segodnya_den_rossiyskoy_nauki',
 'attachments': [{'type': 'video',
   'video': {'access_key': '27143b75f5296c628d',
    'can_comment': 0,
    'can_like': 1,
    'can_repost': 1,
    'can_subscribe': 1,
    'can_add_to_faves': 1,
    'can_add': 1,
    'comments': 0,
    'date': 1549961199,
    'description': '–ü—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –≤\xa0—á–µ—Å—Ç—å –æ—Å–Ω–æ–≤–∞–Ω–∏—è –≤\xa0–Ω–∞—à–µ–π —Å—Ç—Ä–∞–Ω–µ –ê–∫–∞–¥–µ–º–∏–∏ –Ω–∞—É–∫. –û–Ω–∞ –±—ã–ª–∞ —Å–æ–∑–¥–∞–

# Part 2 

###  Retrieving text from posts and getting rid of duplicate posts

In [15]:
def get_post_text(post):
    if 'copy_history' in post:
        text = post['copy_history'][0]['text']
    else:
        text = post['text']
    return text.lower()

In [16]:
test_texts = {get_post_text(post) for post in test_posts}
train_texts = {get_post_text(post) for post in train_posts}

In [17]:
md(f'Found {len(train_posts) - len(train_texts)} duplicate posts from test set and {len(test_posts) - len(test_texts)} from train set.')

Found 622 duplicate posts from test set and 101 from train set.

### Filtering out posts that mention internships, scholarships, grants etc.

In [18]:
def get_texts_by_keywords(texts, words = {'—Å—Ç–∞–∂–∏—Ä–æ–≤–∫', '—Å—Ç–∞–∂–µ—Ä', '—Å—Ç–∞–∂—ë—Ä', 'scholarship', '—Å—Ç–∏–ø–µ–Ω–¥', '–≥—Ä–∞–Ω—Ç', 'intern', '–æ–±–º–µ–Ω'}):
    return [text for text in texts if any(word in text for word in words)]

In [19]:
train_texts = get_texts_by_keywords(train_texts)

### Dumping 500 of selected posts to excel sheet in order to label them manually.

In [66]:
pd.DataFrame(zip(train_texts[:500], [0]*500)).to_excel('train_dump.xlsx', index = False)

# Part 3

In the following section we'll perform text classification.
<br>**Input**: texts of posts mentionning internships, scholarships etc. with a corresponding class (labeled manually)
<br>**Output**: one of the following classes


#### Classes:
* 0 - other
* 1 - internships
* 3 - scholarships, grants

### Retrieving labeled data from excel sheet.

In [67]:
df = pd.read_excel('train.xlsx')
X_not_processed = df[0]
y = df[1]

### Preprocessing data:
* removing urls, emojis and numbers
* removing punctuation signs
* lemmatization 
* removing stopwords

In [68]:
def preprocess(text, morph, stopwords):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.NUMBER)
    text = p.clean(text)

    words = [morph.parse(word)[0].normal_form for word in re.findall(r'\w+', text)]
    words = [word for word in words if word not in stopwords]
    return " ".join(words)

In [69]:
morph = pymorphy2.MorphAnalyzer()
# russian_stopwords = get_stop_words('ru')
russian_stopwords = stopwords.words("russian")

#### Example
**Before preprocessing:**

In [70]:
X_not_processed[0]

'üîÜ —Ñ–æ—Ç–æ–Ω–∏–∫–∞ üîÜ –æ–ø—Ç–∏–∫–∞ üîÜ –ª–∞–∑–µ—Ä—ãüîÜ\n\n–Ω–æ–≤–æ—Å—Ç—å - –±–æ–º–±–∞ üí£ —Å–ø–µ—Ü–∏–∞–ª—å–Ω–æ –¥–ª—è –≤–∞—Å! \n\n–ø—Ä–∏ –ø–æ–¥–¥–µ—Ä–∂–∫–µ –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä–∞–º–∏ –æ–ª–∏–º–ø–∏–∞–¥—ã ¬´—è - –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª¬ª –≤ –ø—è—Ç–Ω–∏—Ü—É, 14 –¥–µ–∫–∞–±—Ä—è, –æ—Ä–≥–∞–Ω–∏–∑–æ–≤–∞–Ω–∞ —ç–∫—Å–∫—É—Ä—Å–∏—è –≤ –∫–æ–º–ø–∞–Ω–∏—é ¬´–ª–∞–∑–µ—Ä–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã¬ª (http://www.lsystems.ru/company/) –≤ –ø–æ—Å. —Å—Ç—Ä–µ–ª—å–Ω–∞. \n¬´–ª–∞–∑–µ—Ä–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã¬ª ‚Äì —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–æ–µ –∏–Ω–Ω–æ–≤–∞—Ü–∏–æ–Ω–Ω–æ–µ –ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏–µ, —Ä–∞–±–æ—Ç–∞—é—â–µ–µ –≤ —Å—Ñ–µ—Ä–µ –ª–∞–∑–µ—Ä–Ω—ã—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –∏ –æ–ø—Ç–æ—ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω—ã—Ö —Å–∏—Å—Ç–µ–º.\n–Ω–∞ —ç–∫—Å–∫—É—Ä—Å–∏–∏ –≤—ã –Ω–µ —Ç–æ–ª—å–∫–æ –±–ª–∏–∂–µ –ø–æ–∑–Ω–∞–∫–æ–º–∏—Ç–µ—Å—å —Å –∫–æ–º–ø–∞–Ω–∏–µ–π –∏ –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ, –Ω–æ –∏ —É–∑–Ω–∞–µ—Ç–µ –æ —Å–≤–æ–∏—Ö –∫–∞—Ä—å–µ—Ä–Ω—ã—Ö –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—è—Ö –≤ –µ—ë —Ä–∞–º–∫–∞—Ö (—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∏, –ø—Ä–∞–∫—Ç–∏–∫–∏, –≤–∞–∫–∞–Ω—Å–∏

**After preprocessing:**

In [72]:
preprocess(X_not_processed[0], morph, russian_stopwords)

'—Ñ–æ—Ç–æ–Ω–∏–∫ –æ–ø—Ç–∏–∫ –ª–∞–∑–µ—Ä –Ω–æ–≤–æ—Å—Ç—å –±–æ–º–±–∞ —Å–ø–µ—Ü–∏–∞–ª—å–Ω–æ –ø–æ–¥–¥–µ—Ä–∂–∫–∞ –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä –æ–ª–∏–º–ø–∏–∞–¥–∞ –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª –ø—è—Ç–Ω–∏—Ü–∞ –¥–µ–∫–∞–±—Ä—å –æ—Ä–≥–∞–Ω–∏–∑–æ–≤–∞—Ç—å —ç–∫—Å–∫—É—Ä—Å–∏—è –∫–æ–º–ø–∞–Ω–∏—è –ª–∞–∑–µ—Ä–Ω—ã–π —Å–∏—Å—Ç–µ–º–∞ –ø–æ—Å —Å—Ç—Ä–µ–ª—å–Ω–∞ –ª–∞–∑–µ—Ä–Ω—ã–π —Å–∏—Å—Ç–µ–º–∞ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π –∏–Ω–Ω–æ–≤–∞—Ü–∏–æ–Ω–Ω—ã–π –ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏–µ —Ä–∞–±–æ—Ç–∞—Ç—å —Å—Ñ–µ—Ä–∞ –ª–∞–∑–µ—Ä–Ω—ã–π —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è –æ–ø—Ç–æ—ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω—ã–π —Å–∏—Å—Ç–µ–º–∞ —ç–∫—Å–∫—É—Ä—Å–∏—è –±–ª–∏–∑–∫–∏–π –ø–æ–∑–Ω–∞–∫–æ–º–∏—Ç—å—Å—è –∫–æ–º–ø–∞–Ω–∏—è –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ —É–∑–Ω–∞—Ç—å —Å–≤–æ–π –∫–∞—Ä—å–µ—Ä–Ω—ã–π –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –µ—ë —Ä–∞–º–∫–∞ —Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞ –ø—Ä–∞–∫—Ç–∏–∫–∞ –≤–∞–∫–∞–Ω—Å–∏—è –Ω–∞—á–∞—Ç—å —ç–∫—Å–∫—É—Ä—Å–∏—è 00 –æ–∫–æ–Ω—á–∞–Ω–∏–µ 00 —Ä–∞–∑–≤–æ–∑–∫–∞ –æ—Å—É—â–µ—Å—Ç–≤–ª—è—Ç—å—Å—è –º–µ—Ç—Ä –ø—Ä–æ—Å–ø–µ–∫—Ç –≤–µ—Ç–µ—Ä–∞–Ω –∑–∞–∫–∞–∑–Ω–æ–π –∞–≤—Ç–æ–±—É—Å –ø–æ—Ç–æ—Ä–æ–ø–∏—Ç—å—Å—è –∑–∞—

In [73]:
X = [preprocess(text, morph, russian_stopwords) for text in X_not_processed]

### Splitting train data into training and validation sets.

In [74]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

### Choosing the best classifier

In [75]:
vectorizers = {'TfidfVectorizer':TfidfVectorizer(), 
               'CountVectorizer':CountVectorizer()
              }
classifiers = {'GradientBoostingClassifier':GradientBoostingClassifier(random_state = 0), 
               'RandomForestClassifier':RandomForestClassifier(random_state = 0), 
               'LinearSVC':LinearSVC(random_state = 0), 
               'MLPClassifier':MLPClassifier(random_state = 0)}

In [76]:
f1_scores = {}
for vectorizer_name, vectorizer in vectorizers.items():
    scores = {}
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    for classifier_name, classifier in classifiers.items():
        classifier.fit(X_train_vec, y_train)
        predictions = classifier.predict(X_val_vec)
        scores[classifier_name] = f1_score(y_val, predictions, average = 'weighted')
    f1_scores[vectorizer_name] = scores

### Classification results

In [77]:
pd.DataFrame(f1_scores)

Unnamed: 0,TfidfVectorizer,CountVectorizer
GradientBoostingClassifier,0.783664,0.778332
LinearSVC,0.810141,0.774514
MLPClassifier,0.773646,0.780671
RandomForestClassifier,0.738774,0.753181


Looks like the most promissing results are achieved by the **GradientBoostingClassifier** together with **TfidfVectorizer**. 
<br>LinearSVC together with **TfidfVectorizer** gives a slightly worse result while being much simpler than GradientBoostingClassifier. The latter looks too sophisticated for our simple problem, so let's stick to the LinearSVC model and try to improve this model's performance by tuning hyperparameters using GridSearchCV.

In [78]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [79]:
param_grid = {'C':np.arange(0.01,100,10), 'max_iter':[1, 5, 10, 50, 100]}
gs = GridSearchCV(LinearSVC(),param_grid,cv=5,return_train_score=True)
gs.fit(X_train_vec,y_train)
best_parameters = gs.best_params_

In [80]:
print(f'Best parameters: {best_parameters}')

Best parameters: {'C': 80.01, 'max_iter': 5}


In [81]:
clf = LinearSVC(**best_parameters, random_state = 0)
clf.fit(X_train_vec,y_train)
predictions = clf.predict(X_val_vec)

In [82]:
best_f1_score = np.around(f1_score(y_val, predictions, average = 'weighted'), decimals=2)

In [83]:
print(f'F1 score of the model with tuned hyperparameters: {best_f1_score}')

F1 score of the model with tuned hyperparameters: 0.8


# Part 4
In this section we'll try to detect topics of posts about internships and about scholarships.
<br>First, let's try to extract topics automatically using LDA model.

In [84]:
def lda(values, num_topics=3, num_words=5):
    values = [preprocess(text, morph, russian_stopwords).split() for text in values]
    dictionary = corpora.Dictionary(values)
    corpus = [dictionary.doc2bow(text) for text in values]
    ldamodel = LdaModel(corpus, num_topics, id2word=dictionary, passes=15)
    topics = ldamodel.print_topics(num_words)
    return topics

In [85]:
selected_posts = {'internships': [post for post, c in zip(X_not_processed, predictions) if c == 1],
                  'scholarships': [post for post, c in zip(X_not_processed, predictions) if c == 3]}

In [86]:
for type, values in selected_posts.items():
    print(type, '\n')
    topics = lda(values)
    for i, topic in enumerate(topics):
        print(i, topic)
    print()

internships 

0 (0, '0.011*"—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞" + 0.011*"–∫–æ–º–ø–∞–Ω–∏—è" + 0.008*"—Ä–∞–±–æ—Ç–∞" + 0.007*"–ø—Ä–∞–∫—Ç–∏–∫–∞" + 0.007*"–∏—Ç–º—ã–π" + 0.007*"–ª–∞–∑–µ—Ä–Ω—ã–π" + 0.006*"—Å–∏—Å—Ç–µ–º–∞" + 0.006*"—ç—Ç–æ" + 0.005*"—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞" + 0.005*"–æ–ø—ã—Ç"')
1 (1, '0.016*"—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç" + 0.016*"–∏—Ç–º—ã–π" + 0.008*"—Å—Ç—É–¥–µ–Ω—Ç" + 0.007*"–≥–æ–¥" + 0.007*"–ø—Ä–æ–µ–∫—Ç" + 0.007*"–∫–æ—Ç–æ—Ä—ã–π" + 0.006*"–∑–∞—è–≤–∫–∞" + 0.006*"–ø–æ–ª—É—á–∏—Ç—å" + 0.006*"—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞" + 0.006*"–ø—Ä–æ–≥—Ä–∞–º–º–∞"')
2 (2, '0.010*"–∏—Ç–º—ã–π" + 0.009*"—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞" + 0.009*"–∫–æ–º–ø–∞–Ω–∏—è" + 0.008*"—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç" + 0.007*"—Å–≤–æ–π" + 0.006*"—Å—Ç—É–¥–µ–Ω—Ç" + 0.006*"–∫–æ—Ç–æ—Ä—ã–π" + 0.006*"—Ä–∞–±–æ—Ç–∞" + 0.005*"–º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–π" + 0.005*"–Ω–∞—É—á–Ω—ã–π"')

scholarships 

0 (0, '0.015*"–∏—Ç–º—ã–π" + 0.010*"—Ä–∞–±–æ—Ç–∞" + 0.008*"—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞" + 0.008*"itmo" + 0.008*"–º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–µ" + 0.008*"–∫—Ä—É–ø–Ω—ã–π" + 0.008*"–∫–æ–º–ø–∞–Ω–∏—è" +

Unfortunately, there doesn't seem to be any topics that could be easily extracted by the model (even if we tune number of topics). So let's use the good old string matching to find **the most wanted skills** in the intership posts.

First, let's look at the most popular words in the posts.

In [87]:
internship_texts = [preprocess(text, morph, russian_stopwords) for text in selected_posts['internships']]
internship_words = [text.split() for text in internship_texts]
internship_words = [item for sublist in internship_words for item in sublist]
counter = collections.Counter(internship_words)
print(counter.most_common(200))

[('–∏—Ç–º—ã–π', 72), ('—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç', 65), ('—Å—Ç–∞–∂–∏—Ä–æ–≤–∫–∞', 49), ('–∫–æ–º–ø–∞–Ω–∏—è', 47), ('—Å—Ç—É–¥–µ–Ω—Ç', 39), ('—Å–≤–æ–π', 33), ('—Ä–∞–±–æ—Ç–∞', 33), ('–ø—Ä–æ–µ–∫—Ç', 32), ('–∫–æ—Ç–æ—Ä—ã–π', 32), ('—ç—Ç–æ', 28), ('–ø–æ–ª—É—á–∏—Ç—å', 25), ('–º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–π', 25), ('–≥–æ–¥', 25), ('itmo', 25), ('–ø—Ä–æ–≥—Ä–∞–º–º–∞', 24), ('–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å', 22), ('–∑–∞—è–≤–∫–∞', 21), ('–≤–µ—Å—å', 21), ('–∫–æ–Ω–∫—É—Ä—Å', 20), ('—É—á–∞—Å—Ç–∏–µ', 19), ('–æ–ø—ã—Ç', 18), ('—Å–∏—Å—Ç–µ–º–∞', 18), ('–ø—Ä–∞–∫—Ç–∏–∫–∞', 16), ('–≥—Ä–∞–Ω—Ç', 16), ('–∫–æ–º–∞–Ω–¥–∞', 15), ('—Ä–æ—Å—Å–∏—è', 15), ('—É–∑–Ω–∞—Ç—å', 15), ('—Ü–µ–Ω—Ç—Ä', 15), ('–æ–±–ª–∞—Å—Ç—å', 15), ('–æ—Ç–∫—Ä—ã—Ç—ã–π', 15), ('–Ω–∞—É—á–Ω—ã–π', 15), ('—Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏—è', 15), ('—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞', 14), ('–ø—Ä–æ–π—Ç–∏', 14), ('ru', 14), ('–º–æ—á—å', 13), ('—Å–∞–º—ã–π', 12), ('—Å–º–æ—á—å', 12), ('–∫—Ä—É–ø–Ω—ã–π', 12), ('–ª–∞–∑–µ—Ä–Ω—ã–π', 12), ('–≤–æ–ø—Ä–æ—Å', 12), ('–ª–∞–±–æ—Ä–∞—Ç–æ—Ä–∏—è', 12), ('–¥–µ–Ω—å', 12),

### Now let's choose the words that correspond to some programmer's skills and are relatively popular in the dataset.

In [88]:
key_words = ['java', 'c', 'python', 'javascript', 'frontend', 'backend', 'linux', 'office',
             'bi', 'data science', '–∞–Ω–∞–ª–∏–∑ –¥–∞—Ç—å', '–º–∞—à–∏–Ω–Ω—ã–π –æ–±—É—á–µ–Ω–∏–µ', 'machine learning', '–∞–Ω–∞–ª–∏—Ç–∏–∫', 
             '—Ç–µ—Å—Ç–∏—Ä–æ–≤—â–∏–∫', '—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ', '–∞–Ω–≥–ª–∏–π—Å–∫–∏–π', '–¥–∏–∑–∞–π–Ω', '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞', '—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫', '–∏–Ω–∂–µ–Ω–µ—Ä','1—Å']

In [89]:
def get_most_popular_skills(key_words, texts):
    skills_dict = dict.fromkeys(key_words, 0)
    for text in texts:
        for word in key_words:
            if word in text:
                skills_dict[word] += 1
    skills_dict = {skill: count for skill, count in sorted(skills_dict.items(), key=lambda item: item[1], reverse=True)}
    return skills_dict

In [90]:
skills_dict = get_most_popular_skills(key_words, internship_texts)
for skill, count in skills_dict.items():
    print(f'{skill}: {count}')

c: 39
—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞: 10
–¥–∏–∑–∞–π–Ω: 6
–∏–Ω–∂–µ–Ω–µ—Ä: 6
bi: 4
—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ: 4
–∞–Ω–≥–ª–∏–π—Å–∫–∏–π: 4
—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫: 3
python: 1
frontend: 1
–∞–Ω–∞–ª–∏–∑ –¥–∞—Ç—å: 1
–º–∞—à–∏–Ω–Ω—ã–π –æ–±—É—á–µ–Ω–∏–µ: 1
machine learning: 1
–∞–Ω–∞–ª–∏—Ç–∏–∫: 1
java: 0
javascript: 0
backend: 0
linux: 0
office: 0
data science: 0
—Ç–µ—Å—Ç–∏—Ä–æ–≤—â–∏–∫: 0
1—Å: 0


# Part 5
## Test data
Now let's take a look at the test set, i.e. texts published before Ferbuary 2019.

#### First, let's use our classifier to extract texts that announce internships and scholarships. 

In [91]:
test_texts_orig = get_texts_by_keywords(test_texts)
test_texts = [preprocess(text, morph, russian_stopwords) for text in test_texts_orig]

In [92]:
test_texts_vec = vectorizer.transform(test_texts)
test_predictions = clf.predict(test_texts_vec)

#### Now let's dump texts and our predictions to a csv file.

In [93]:
pd.DataFrame(zip(test_texts_orig, test_predictions), 
             columns = ['text', 'class']).to_csv('part_3.csv', index=False)

In [94]:
test_internships = [post for post, c in zip(test_texts, test_predictions) if c == 1]

#### What are the most wanted skills in 2019-2020?

In [95]:
test_skills_dict = get_most_popular_skills(key_words, test_internships)
for skill, count in test_skills_dict.items():
    print(f'{skill}: {count}')

c: 68
—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫: 18
—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞: 17
–∏–Ω–∂–µ–Ω–µ—Ä: 17
–∞–Ω–∞–ª–∏—Ç–∏–∫: 14
–∞–Ω–≥–ª–∏–π—Å–∫–∏–π: 13
bi: 12
—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ: 11
java: 10
python: 9
frontend: 6
backend: 4
–¥–∏–∑–∞–π–Ω: 4
javascript: 3
linux: 3
data science: 3
–º–∞—à–∏–Ω–Ω—ã–π –æ–±—É—á–µ–Ω–∏–µ: 3
–∞–Ω–∞–ª–∏–∑ –¥–∞—Ç—å: 1
—Ç–µ—Å—Ç–∏—Ä–æ–≤—â–∏–∫: 1
office: 0
machine learning: 0
1—Å: 0


### What can we tell from the data?
* Most of the companies were looking for developers, most of them for c and c++ languages.
* Machine Learning, python, data science, business intelligence speciallists are much more likely to find an internship in 2020 than in the previous years.

In [96]:
def get_keywords_from_text(key_words, text):
    result = []
    for word in key_words:
        if word in text:
            result.append(word)
    return result

In [97]:
test_internships_keywords = [get_keywords_from_text(key_words, text) for text in test_internships]

#### Now let's dump internships texts and our predictions to a csv file.

In [98]:
test_internships_orig = [post for post, c in zip(test_texts_orig, test_predictions) if c == 1]
pd.DataFrame(zip(test_internships_orig, test_internships_keywords), 
             columns = ['text', 'keywords']).to_csv('part_4.csv', index=False)