In [None]:
import pandas as pd
df = pd.read_csv('data.csv')
df.head()

In [None]:
# 增加一列困难人数百分比Number in hard mode/Number of reported results
# 转换日期格式
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')
df['hard_mode_percentage'] = df['Number in hard mode']/df['Number of  reported results']
df.head()

In [None]:
# 计算七列tries的加权平均值，行内数值为权重，分别x1,x2,x3,x4,x5,x6,x7
df['mean_tries'] = (df['1 try']*1 + df['2 tries']*2 + df['3 tries']*3 + df['4 tries']*4 + df['5 tries']*5 + df['6 tries']*6+df['7 or more tries (X)']*7 )/100

In [None]:
# 增加一列熵和频率度量的加权和，结果在valid_words_entropy_map.json，对应Word查询
import json
with open('valid_words_entropy_map.json', 'r') as f:
    entropy_map = json.load(f)
df['entropy with freq'] = df['Word'].map(entropy_map)
df.head()

In [None]:
# 增加一列是否有重复字母 1或者0
df['has_duplicate'] = df['Word'].apply(lambda x: 1 if len(set(x)) != len(x) else 0)
# 增加一列重复字母的个数
df['duplicate_count'] = df['Word'].apply(lambda x: len(x) - len(set(x)))

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
word_list = df['Word'].to_list()

# flatten the word list into a list of letters
letter_list = [letter for word in list(word_list) for letter in word]

count = Counter(letter_list)
plt.rc("figure", autolayout=True, figsize=(13, 5))
ax = plt.bar(*zip(*count.most_common()), color='green')
plt.title("Valid solutions letter frequency")

In [None]:
# 增加一列计算每行字母的频率之和
df['letter_freq_sum'] = df['Word'].apply(lambda x: sum([count[letter] for letter in x]))

In [None]:
word_list = df['Word'].to_list()

letter_list = [word[0] for word in list(word_list)]

count = Counter(letter_list)
plt.rc("figure", autolayout=True, figsize=(13, 5))
ax = plt.bar(*zip(*count.most_common()), color='green')
plt.title("Valid solutions first-letter frequency")

In [None]:
# 增加一列计算每行首字母的频率
df['first_letter_freq'] = df['Word'].apply(lambda x: count[x[0]])
df.head()

In [None]:
df.to_csv('data_with_features.csv', index=False)

In [None]:
import pandas as pd
df = pd.read_csv('data_with_features.csv')
df.head()

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
# 取出word entropy with freq，duplicate_count,letter_freq_sum,first_letter_freq和1-7的tries得到一个新的dataframe
train_data = df[['Word', 'entropy with freq', 'duplicate_count', 'letter_freq_sum', 'first_letter_freq', '1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)','mean_tries']]
train_data.to_csv('train_data.csv', index=False)

In [None]:
train_data = TabularDataset('train_data.csv')

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path

class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = self.path + "Predictor_" + label
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for label in self.labels:
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

In [None]:
labels = ['1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)','mean_tries']  # which columns to predict based on the others
problem_types = ['regression','regression','regression','regression','regression','regression','regression','regression']  # type of each prediction problem (optional)
eval_metrics = ['mean_absolute_error','mean_absolute_error','mean_absolute_error','mean_absolute_error','mean_absolute_error','mean_absolute_error','mean_absolute_error','mean_absolute_error']  # metrics used to evaluate predictions for each label (optional)
save_path = 'agModels-predictEducationClass'  # specifies folder to store trained models (optional)

time_limit = 600  # how many seconds to train the TabularPredictor for each label, set much larger in your applications!

In [None]:
multi_predictor = MultilabelPredictor(labels=labels, problem_types=problem_types, eval_metrics=eval_metrics, path=save_path)
multi_predictor.fit(train_data, time_limit=time_limit)

In [None]:
# 新建一个与训练集列名相同的dataframe
test_data = pd.read_csv('test.csv')
test_data['entropy with freq'] = test_data['Word'].map(entropy_map)
test_data['letter_freq_sum'] = test_data['Word'].apply(lambda x: sum([count[letter] for letter in x]))
test_data['first_letter_freq'] = test_data['Word'].apply(lambda x: count[x[0]])
test_data.to_csv('test.csv', index=False)

In [4]:
test_data = TabularDataset('test.csv')
predictions = multi_predictor.predict(test_data)
print("Predictions:  \n", predictions)

Predicting with TabularPredictor for label: 1 try ...
Predicting with TabularPredictor for label: 2 tries ...
Predicting with TabularPredictor for label: 3 tries ...
Predicting with TabularPredictor for label: 4 tries ...
Predicting with TabularPredictor for label: 5 tries ...
Predicting with TabularPredictor for label: 6 tries ...
Predicting with TabularPredictor for label: 7 or more tries (X) ...
Predicting with TabularPredictor for label: mean_tries ...
Predictions:  
           1 try   2 tries   3 tries    4 tries    5 tries   6 tries  \
0  3.277114e-14  1.722217  9.310204  27.911457  34.776711  21.72345   

   7 or more tries (X)  mean_tries  
0             6.381194    4.425645  


In [None]:
# 分析df中mean_tries列的分布 计算均值 标准差等 利用plotly画箱型图小提琴图
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np
# 分析与可视化
data = df['mean_tries']
fig = px.box(data, y="mean_tries")
fig.show()

In [None]:
trace1 = go.Box(y=data,name='mean_tries')
layout = go.Layout(plot_bgcolor='#ffffff',width=500,height=500)
data = [trace1]
fig = go.Figure(data=data,layout=layout)
fig.show()

In [None]:
# 画小提琴图
fig = ff.create_violin(data, data_header='mean_tries', height=500, width=500)
fig.show()

In [None]:
import plotly.express as px
data = pd.DataFrame()
# data 是mean_tries列的数据乘以1减去hard_mode_percentage列的数据
data['scores'] = df['mean_tries'] * (1-df['hard_mode_percentage'])
fig = px.violin(data, y="scores", box=True, # 在小提琴内部绘制方框图
                points='all', # 可以是 'outliers', or False
               )
fig.update_layout(title='Violin and Box Plot')
fig.show()


In [3]:
multi_predictor = MultilabelPredictor.load('agModels-predictEducationClass')

In [None]:
# 利用data['scores']绘制饼状图
# 小于3.63的为beginner 3.63-3.84为challenger 3.84-4.10为master 4.10以上为legendary
import plotly.express as px
values = [len(data[data['scores']<3.63]),len(data[(data['scores']>=3.63) & (data['scores']<3.84)]),len(data[(data['scores']>=3.84) & (data['scores']<4.10)]),len(data[data['scores']>=4.10])]
labels = ['beginner','challenger','master','legendary']
fig = px.pie(values=values, names=labels)
fig.show()
