In [161]:
import requests
import pandas as pd
from datetime import datetime
import time
from datetime import timedelta
import pickle
import re
import preprocessor as p
from nltk.corpus import stopwords
import pymorphy2
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import collections
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Part 1

### Retrieving posts 
Posts were retrived from the following vk grops: 
* https://vk.com/itmomagistry 
* https://vk.com/careers_service_itmo 
* https://vk.com/itmoru 
* https://vk.com/itmo_exchange 
* https://m.vk.com/scicomm).

In [217]:
def get_posts(group):

    count = 100
    offset = 0 
    posts = []
    retrieved_posts = 'yes'
    
    while retrieved_posts:

        url = 'https://api.vk.com/method/wall.get'
        params = {
            'domain': group,
            'filter': 'owner',
            'count': count,
            'offset': offset,
            'access_token': '8e84f1f18e84f1f18e84f1f1db8eeb68e388e848e84f1f1d0bb32bd2b9ea45a67630377',
            'v': 5.73
        }

        retrieved_posts = requests.get(url, params = params).json()['response']['items']

        posts += retrieved_posts 
        offset += count   
        time.sleep(0.5)
            
    return posts

In [218]:
groups = ['itmomagistry', 'careers_service_itmo', 'itmoru', 'itmo_exchange', 'scicomm']

posts = []
for group in groups:
    posts += get_posts(group=group)

In [3]:
# pickle.dump( posts, open( "posts.p", "wb" ) )
posts = pickle.load( open( "posts.p", "rb" ) )

### Splitting posts into old (older than a year ago) and new (later than a year ago) one. 
The old ones will be used for training. The new ones will be used for testing. 

In [4]:
def split_train_test(posts):
    test = []
    train = []
    
    year_ago = (datetime.now()-timedelta(days=365)).timestamp()
    
    for post in posts:
        if post['date'] > year_ago:
            test.append(post)
        else:
            train.append(post)
            
    return test, train

In [183]:
test_posts, train_posts = split_train_test(posts)

In [6]:
md(f'We have retrieved {len(posts)} posts.<br>{len(train_posts)} from them were published before February 2019. These posts will be used for training<br>{len(test_posts)} were published after February 2019. These posts will be used for testing')

We have retrieved 13586 posts.<br>11095 from them were published before February 2019. These posts will be used for training<br>2491 were published after February 2019. These posts will be used for testing

#### Post example

In [182]:
train_posts[0]

{'id': 3763,
 'from_id': -54201931,
 'owner_id': -54201931,
 'date': 1549890800,
 'marked_as_ads': 0,
 'post_type': 'post',
 'text': '💡ITMO Open Science: битва мегадеканов, питчи ученых и научная выставка в крупнейшем музее современного искусства\n📝 Подробности читайте по ссылке: http://news.ifmo.ru/ru/education/official/news/8237/',
 'attachments': [{'type': 'link',
   'link': {'url': 'http://news.ifmo.ru/ru/education/official/news/8237/',
    'title': 'ITMO Open Science: битва мегадеканов, питчи ученых и научная выставка в крупнейшем музее современног',
    'caption': 'news.ifmo.ru',
    'description': '8 февраля, в День российской науки, Университет ИТМО провел научную конференцию в новом формате — ITMO Open Science. На протяжении дня в одном из крупнейших музеев современного искусства России «Эрарта» выступили руководители подразделений, молодые исследователи и аспиранты вуза. Ученые в нестандартном формате рассказали об актуальных достижениях в различных областях науки, а также по

# Part 2 

###  Retrieving text from posts and getting rid of duplicate posts

In [7]:
def get_post_text(post):
    if 'copy_history' in post:
        text = post['copy_history'][0]['text']
    else:
        text = post['text']
    return text.lower()

In [184]:
test_texts = {get_post_text(post) for post in test_posts}
train_texts = {get_post_text(post) for post in train_posts}

In [9]:
md(f'Found {len(train_posts) - len(train_texts)} duplicate posts from test set and {len(test_posts) - len(test_texts)} from train set.')

Found 622 duplicate posts from test set and 101 from train set.

### Filtering out posts that mention internships, scholarships, grants etc.

In [10]:
def get_texts_by_keywords(texts, words = {'стажировк', 'стажер', 'стажёр', 'scholarship', 'стипенд', 'грант', 'intern', 'обмен'}):
    return [text for text in texts if any(word in text for word in words)]

In [11]:
train_texts = get_texts_by_keywords(train_texts)

### Dumping 500 of selected posts to excel sheet in order to label them manually.

In [250]:
pd.DataFrame(zip(train_texts_final[:500], [0]*500)).to_excel('train.xlsx', index = False)

# Part 3

In the following section we'll perform text classification.
<br>**Input**: texts of posts mentionning internships, scholarships etc. with a corresponding class (labeled mabually)
<br>**Output**: one of the foolowing classes


#### Classes:
* 0 - other
* 1 - internships
* 3 - scholarships, grants

### Retrieving labeled data from excel sheet.

In [102]:
df = pd.read_excel('train.xlsx')
X_not_processed = df[0]
y = df[1]

### Preprocessing data:
* removing urls, emojis and numbers
* removing punctuation signs
* lemmatization 
* removing stopwords

In [54]:
def preprocess(text, morph, stopwords):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.NUMBER)
    text = p.clean(text)

    words = [morph.parse(word)[0].normal_form for word in re.findall(r'\w+', text)]
    words = [word for word in words if word not in stopwords]
    return " ".join(words)

In [55]:
morph = pymorphy2.MorphAnalyzer()
# russian_stopwords = get_stop_words('ru')
russian_stopwords = stopwords.words("russian")

#### Example
**Before preprocessing:**

In [103]:
X_not_processed[0]

'последняя неделя приема заявок на конкурс #gotech!\n \ngotech (ex – web ready) – конкурс №1 для технологических компаний c 2009 года. узнайте больше и станьте участником на www.gotech.vc \n\n14 номинаций, участники которых получат возможности: \n\n• попасть в акселерационные программы intel и google; \n• запустить совместные go-to-market проекты с «лабораторией касперского», гк «мортон», банком «санкт-петербург», b2b-center и емс; \n• получить гранты и инвестиции от фонда «сколково», runa capital, phystech ventures, the untitled, idealmachine и фрии; \n• рассказать о своем проекте 100 инвесторам и 1000 участникам форума gotech; \n• получить возможность представлять россию на web summit, slush, tech open air berlin, wolves summit, hub conference, pirate summit, seedstarsworld и других крупнейших стартап-событиях; \n• поехать на road show в финляндию; \n• запустить продажи продукта в сша и открыть там офис благодаря программам idm usa landing, hack temple и starta accelerator. \n• получ

**After preprocessing:**

In [104]:
preprocess(X[0], morph, russian_stopwords)

'последний неделя приём заявка конкурс gotech gotech ex web ready конкурс 1 технологический компания c год узнать большой стать участник номинация участник который получить возможность попасть акселерационный программа intel google запустить совместный go to market проект лаборатория касперский гк мортон банк санкт петербург b2b center емс получить грант инвестиция фонд сколково runa capital phystech ventures the untitled idealmachine фрий рассказать свой проект инвестор участник форум gotech получить возможность представлять россия web summit slush tech open air berlin wolves summit hub conference pirate summit seedstarsworld крупный стартап событие поехать road show финляндия запустить продажа продукт сша открыть офис благодаря программа idm usa landing hack temple starta accelerator получить оценка профессиональный сообщество привлечь интерес сми потенциальный клиент'

In [105]:
X = [preprocess(text, morph, russian_stopwords) for text in X]

### Splitting train data into training and validation sets.

In [59]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

### Choosing the best classifier

In [60]:
vectorizers = {'TfidfVectorizer':TfidfVectorizer(), 
               'CountVectorizer':CountVectorizer()
              }
classifiers = {'GradientBoostingClassifier':GradientBoostingClassifier(random_state = 0), 
               'RandomForestClassifier':RandomForestClassifier(random_state = 0), 
               'LinearSVC':LinearSVC(random_state = 0), 
               'MLPClassifier':MLPClassifier(random_state = 0)}

In [61]:
f1_scores = {}
for vectorizer_name, vectorizer in vectorizers.items():
    scores = {}
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    for classifier_name, classifier in classifiers.items():
        classifier.fit(X_train_vec, y_train)
        predictions = classifier.predict(X_val_vec)
        scores[classifier_name] = f1_score(y_val, predictions, average = 'weighted')
    f1_scores[vectorizer_name] = scores

### Classification results

In [62]:
pd.DataFrame(f1_scores)

Unnamed: 0,TfidfVectorizer,CountVectorizer
GradientBoostingClassifier,0.781883,0.784039
LinearSVC,0.81108,0.774358
MLPClassifier,0.779818,0.771987
RandomForestClassifier,0.717159,0.738743


Looks like the most promissing results are achieved by the **GradientBoostingClassifier** together with **TfidfVectorizer**. 
<br>LinearSVC together with **TfidfVectorizer** gives a slightly worse result while being much simpler than GradientBoostingClassifier. The latter looks too sophisticated for our simple problem6 so let's stick to the LinearSVC model and try to improve this model's performance by tuning hyperparameters using GridSearchCV.

In [63]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [66]:
param_grid = {'C':np.arange(0.01,100,10), 'max_iter':[1, 5, 10, 50, 100]}
gs = GridSearchCV(LinearSVC(),param_grid,cv=5,return_train_score=True)
gs.fit(X_train_vec,y_train)
best_parameters = gs.best_params_

In [67]:
print(f'Best parameters: {best_parameters}')

Best parameters: {'C': 90.01, 'max_iter': 5}


In [68]:
clf = LinearSVC(**best_parameters, random_state = 0)
clf.fit(X_train_vec,y_train)
predictions = clf.predict(X_val_vec)

In [69]:
best_f1_score = np.around(f1_score(y_val, predictions, average = 'weighted'), decimals=2)

In [70]:
print(f'F1 score of the model with tuned hyperparameters: {best_f1_score}')

F1 score of the model with tuned hyperparameters: 0.82


# Part 4
In this section we'll try to detect topics of posts about internships and about scolarships.
<br>First, let's try to extract topics automatically using LDA model.

In [99]:
def lda(values, num_topics=3, num_words=5):
    values = [preprocess(text, morph, russian_stopwords).split() for text in values]
    dictionary = corpora.Dictionary(values)
    corpus = [dictionary.doc2bow(text) for text in values]
    ldamodel = LdaModel(corpus, num_topics, id2word=dictionary, passes=15)
    topics = ldamodel.print_topics(num_words)
    return topics

In [100]:
selected_posts = {'internships': [post for post, c in zip(X_not_processed, predictions) if c == 1],
                  'scholarships': [post for post, c in zip(X_not_processed, predictions) if c == 3]}

In [101]:
for type, values in selected_posts.items():
    print(type, '\n')
    topics = lda(values)
    for i, topic in enumerate(topics):
        print(i, topic)
    print()

internships 

0 (0, '0.012*"компания" + 0.009*"работа" + 0.006*"мочь" + 0.006*"получить" + 0.005*"собеседование" + 0.005*"это" + 0.005*"возможность" + 0.005*"кандидат" + 0.005*"тестирование" + 0.005*"год"')
1 (1, '0.018*"стажировка" + 0.015*"итмый" + 0.012*"университет" + 0.011*"студент" + 0.011*"компания" + 0.009*"конкурс" + 0.008*"itmo" + 0.008*"программа" + 0.007*"год" + 0.006*"участие"')
2 (2, '0.010*"студент" + 0.009*"карта" + 0.009*"банковский" + 0.008*"компания" + 0.007*"сентябрь" + 0.007*"факультет" + 0.006*"стипендия" + 0.006*"0" + 0.006*"дать" + 0.005*"технология"')

scholarships 

0 (0, '0.010*"год" + 0.006*"стажировка" + 0.006*"it" + 0.005*"карьер" + 0.005*"ru" + 0.005*"конкурс" + 0.005*"00" + 0.005*"информация" + 0.005*"вакансия" + 0.005*"ярмарка"')
1 (1, '0.011*"компания" + 0.009*"который" + 0.008*"работа" + 0.007*"конкурс" + 0.006*"вопрос" + 0.005*"свой" + 0.005*"весь" + 0.005*"пройти" + 0.005*"ваш" + 0.004*"дать"')
2 (2, '0.014*"студент" + 0.010*"стажировка" + 0.008*"ун

Unfortunately, there there doesn't seem to be any topics that could be easily extracted by the model (even if we tune number of topics). So let's use the good old string matching to find **the most wanted skills** in the intership posts.

First, let's look at the most popular words in the posts.

In [140]:
internship_texts = [preprocess(text, morph, russian_stopwords) for text in selected_posts['internships']]
internship_words = [text.split() for text in internship_texts]
internship_words = [item for sublist in internship_words for item in sublist]
counter = collections.Counter(internship_words)
print(counter.most_common(200))

[('компания', 56), ('стажировка', 52), ('студент', 44), ('итмый', 41), ('университет', 31), ('работа', 29), ('конкурс', 26), ('год', 25), ('свой', 24), ('это', 24), ('программа', 23), ('itmo', 23), ('получить', 22), ('возможность', 20), ('курс', 20), ('весь', 19), ('проект', 18), ('участие', 18), ('стипендия', 17), ('мочь', 16), ('информация', 15), ('заявка', 14), ('который', 14), ('обучение', 14), ('пройти', 14), ('вопрос', 14), ('команда', 14), ('время', 14), ('резюме', 14), ('россия', 13), ('также', 13), ('собеседование', 13), ('технология', 12), ('специалист', 12), ('регистрация', 12), ('сентябрь', 12), ('карта', 12), ('петербург', 11), ('грант', 11), ('открытый', 11), ('язык', 11), ('система', 11), ('проходить', 11), ('кандидат', 11), ('банковский', 11), ('приглашать', 10), ('наш', 10), ('российский', 10), ('дать', 10), ('научный', 10), ('апрель', 10), ('факультет', 10), ('хороший', 10), ('приём', 9), ('санкт', 9), ('крупный', 9), ('офис', 9), ('ведущий', 9), ('развитие', 9), ('пр

### Now let's choose the words that correspond to some programmer's skills and are relatively popular in dataset.

In [127]:
key_words = ['java', 'c', 'python', 'javascript', 'frontend', 'backend', 'linux', 'office',
             'bi', 'data science', 'анализ дать', 'машинный обучение', 'machine learning', 'аналитик', 
             'тестировщик', 'тестирование', 'английский', 'дизайн', 'разработка', 'разработчик', 'инженер','1с']

In [169]:
def get_most_popular_skills(key_words, texts):
    skills_dict = dict.fromkeys(key_words, 0)
    for text in texts:
        for word in key_words:
            if word in text:
                skills_dict[word] += 1
    skills_dict = {skill: count for skill, count in sorted(skills_dict.items(), key=lambda item: item[1], reverse=True)}
    return skills_dict

In [174]:
skills_dict = get_most_popular_skills(key_words, internship_texts)
for skill, count in skills_dict.items():
    print(f'{skill}: {count}')

c: 28
разработка: 5
английский: 4
java: 3
аналитик: 3
тестирование: 3
инженер: 3
тестировщик: 2
дизайн: 2
разработчик: 2
python: 1
linux: 1
bi: 1
javascript: 0
frontend: 0
backend: 0
office: 0
data science: 0
анализ дать: 0
машинный обучение: 0
machine learning: 0
1с: 0


# Part 5
## Test data
Now let's take a look at the test set, i.e. texts published before Ferbuary 2019.

#### First, let's use our classifier to extract texts that announce internships and scholarships. 

In [185]:
test_texts_orig = get_texts_by_keywords(test_texts)
test_texts = [preprocess(text, morph, russian_stopwords) for text in test_texts_orig]

In [165]:
test_texts_vec = vectorizer.transform(test_texts)
test_predictions = clf.predict(test_texts_vec)

#### Now let's dump texts and our predictions to a csv file.

In [195]:
pd.DataFrame(zip(test_texts_orig, test_predictions), columns = ['text', 'class']).to_csv('part_3.csv')

In [166]:
test_internships = [post for post, c in zip(test_texts, test_predictions) if c == 1]

#### What are the most wanted skills in 2019-2020?

In [173]:
test_skills_dict = get_most_popular_skills(key_words, test_internships)
for skill, count in test_skills_dict.items():
    print(f'{skill}: {count}')

c: 55
аналитик: 13
разработчик: 13
инженер: 13
английский: 12
разработка: 12
тестирование: 11
bi: 9
java: 8
python: 7
frontend: 6
backend: 4
javascript: 3
linux: 3
дизайн: 3
data science: 2
машинный обучение: 2
анализ дать: 1
тестировщик: 1
office: 0
machine learning: 0
1с: 0


### What can we tell from the data?
* Most of the companies were looking for developpers, most of them for c and c++ languages.
* Machine Learning, python, data science, business intelligence speciallists are much more lickely to find an internship in 2020 tham in the previous years.

In [187]:
def get_keywords_from_text(key_words, text):
    result = []
    for word in key_words:
        if word in text:
            result.append(word)
    return result

In [192]:
test_internships_keywords = [get_keywords_from_text(key_words, text) for text in test_internships]

#### Now let's dump internships texts and our predictions to a csv file.

In [197]:
pd.DataFrame(zip(test_internships, test_internships_keywords), columns = ['text', 'keywords']).to_csv('part_4.csv')