In [1]:
from bs4 import BeautifulSoup as BS
import numpy as np
import pandas as pd

In [2]:
# Feature codes, grouped by family. The groups are concatenated in the order
# lv, ls, la, ca, ld, so a code's position in `feature_names` is the index the
# later cells use to look it up.

# LV_* family: 14 features (positions 0-13)
features_lv = [
    'LV_W', 'LV_WT', 'LV_WT1', 'LV_TTR', 'LV_CTTR', 'LV_RTTR', 'LV_HDD',
    'LV_DUGA', 'LV_MAAS', 'LV_SUMM', 'LV_YULEK', 'LV_MTLD', 'LV_MSTTR', 'LV_MATTR',
]

# LS_* family: 14 features (positions 14-27)
features_ls = [
    "LS_FPC_NG", "LS_FPC_NA", "LS_FPC_TC", "LS_FPC_BS", "LS_FPC_CA",
    "LS_FPC_CT", "LS_FPC_CGA1", "LS_FPC_CGA2", "LS_FPC_CGA3",
    "LS_FOMN_NG", "LS_FOMN_NA", "LS_FOMN_TC", "LS_FOMN_BS", "LS_FOMN_CA",
]

# LA_* family: 2 features (positions 28-29)
features_la = ["LA_ER", "LA_COL_ERR_R"]

# CA_* family: 3 features (positions 30-32)
features_ca = ["CA_BIN1_R", "CA_BIN2_R", "CA_BIN3_R"]

# LD_* family: 2 features (positions 33-34)
features_ld = ["LD_LXUR", "LD_GRUR"]

# One flat array of all 35 codes; an ndarray so it can be fancy-indexed with a list.
feature_names = np.array([
    *features_lv,
    *features_ls,
    *features_la,
    *features_ca,
    *features_ld,
])

# Positions of the features that get binned:
# LV_HDD (6), LS_FPC_CA..LS_FPC_CGA3 (18-22), LA_ER..LD_LXUR (28-33).
binning_indicies = [6, *range(18, 23), *range(28, 34)]

In [3]:
#pipeline


In [4]:
# Level A1: per-feature bin percentages plus one sample essay per feedback class.
level = "a1"

# --- bin thresholds: mean and standard deviation of every binned feature ---
path_to_bins = "bins/bin_values_level_"+level+".xml"
# Context manager so the handle is closed as soon as the XML has been parsed.
with open(path_to_bins, "r") as bins_file:
    bs = BS(bins_file, "lxml")

binning_criterias = []  # placeholder; nothing in this cell fills it

features = bs.find_all("feature")
feature_indicies = np.array([int(feat.find("feature_index").text) for feat in features])
means = np.array([float(feat.find("mean").text) for feat in features]).round(4)
stdds = np.array([float(feat.find("std").text) for feat in features]).round(4)

# Summary table of the thresholds. Names are looked up through the indices read
# from the XML (the same lookup the report loop below uses), so each row is
# labelled with the feature its numbers belong to.
columns = ["Feature Index", "Feature code", "Mean", "Standard deviation", "mean-std", "mean+std"]
data = np.vstack((feature_indicies, feature_names[feature_indicies], means, stdds,
                  (means-stdds).round(4), (means+stdds).round(4)))
df = pd.DataFrame(data.T, columns=columns)

# --- essays and grades: at most the first 1000 writings of this level ---
essay_file = "dataset_binned/level_" + level + ".xml"
with open(essay_file, "r") as f:
    bs = BS(f, "lxml")
    writings = bs.find_all("writing")[:1000]
essays = [writing.find("text").text for writing in writings]
grades = np.array([int(writing.find("grade").text) for writing in writings])

bins = []  # placeholder; nothing in this cell fills it

# --- cached features; the stored grades must line up with the essays just read ---
feature_matrix = np.load("all_features_1000/feature_matrix_"+level+"_fm.npy")
grades_test = np.load("all_features_1000/feature_matrix_"+level+"_grades.npy")
# array_equal returns False on a length mismatch instead of raising or broadcasting.
assert np.array_equal(grades_test, grades), "cached grades do not match " + essay_file

rule = "---------------------------------------------------------------------------"

for index, mean, std in zip(feature_indicies, means, stdds):
    values = feature_matrix.T[index]  # this feature's value for every essay
    n_samples = len(values)
    lower, upper = mean - std, mean + std

    low_mask = values <= lower
    high_mask = values >= upper
    middle_mask = (values > lower) & (values < upper)

    # Four bins: (-inf, lower], (lower, mean], (mean, upper), [upper, +inf)
    bin_masks = (
        low_mask,
        (values > lower) & (values <= mean),
        (values > mean) & (values < upper),
        high_mask,
    )

    print("Feature code: ", feature_names[index])
    for bin_number, mask in enumerate(bin_masks, start=1):
        # Percentage of the essays actually loaded rather than of an assumed 1000.
        share = 100 * np.count_nonzero(mask) / n_samples if n_samples else 0.0
        print("Bin{}: {}%".format(bin_number, share))
    print()

    # First essay falling in each class; -1 marks "no essay in this class".
    negative_feedback_index = np.argmax(low_mask) if low_mask.any() else -1
    neutral_feedback_index = np.argmax(middle_mask) if middle_mask.any() else -1
    positive_feedback_index = np.argmax(high_mask) if high_mask.any() else -1

    # For these three features the high bin is reported as negative feedback and
    # the low bin as positive, so the two samples trade places.
    if feature_names[index] in ("LA_ER", "LA_COL_ERR_R", "CA_BIN1_R"):
        negative_feedback_index, positive_feedback_index = positive_feedback_index, negative_feedback_index

    print("3 sample essays")
    print("Mean: {}, std: {}, Mean-std: {}, mean+std: {}".format(mean, std, mean-std, mean+std))

    samples = (
        ("Positive", positive_feedback_index, "\n"),
        ("Neutral", neutral_feedback_index, "\n"),
        ("Negative", negative_feedback_index, "\n\n"),
    )
    for feedback, sample_index, trailer in samples:
        found = sample_index != -1
        print(rule)
        print("Feedback: {}".format(feedback))
        print("Essay: {}".format(essays[sample_index] if found else "No sample"))
        print("Grade: {}".format(grades[sample_index] if found else "No sample"))
        print("Feature value: {}{}".format(values[sample_index] if found else "No sample", trailer))
    print("===========================================================================")

Feature code:  LV_HDD
Bin1: 0.0%
Bin2: 91.4%
Bin3: 0.0%
Bin4: 8.6%

3 sample essays
Mean: 0.0608, std: 0.2002, Mean-std: -0.1394, mean+std: 0.261
---------------------------------------------------------------------------
Feedback: Positive
Essay: 
      I know a hotel in the city center next to the train station. it is very well placed for 5 min you a tram station and you're up to 10 min for shopping. It is located north of Montpellier in a beautiful area
    
Grade: 80
Feature value: 0.7380952380952386

---------------------------------------------------------------------------
Feedback: Neutral
Essay: 
      In the office there are computer, mouse. There are a lot of pen and pencil. There are a few windows. There are some desk and somechair. There are a few meeting rooms , restroons. There''s a Kitchen.
    
Grade: 95
Feature value: 0.0

---------------------------------------------------------------------------
Feedback: Negative
Essay: No sample
Grade: No sample
Feature value: No 

<h1>Level A2 bin percentages</h1>

In [5]:
# Level A2: per-feature bin percentages plus one sample essay per feedback class.
level = "a2"

# --- bin thresholds: mean and standard deviation of every binned feature ---
path_to_bins = "bins/bin_values_level_"+level+".xml"
# Context manager so the handle is closed as soon as the XML has been parsed.
with open(path_to_bins, "r") as bins_file:
    bs = BS(bins_file, "lxml")

binning_criterias = []  # placeholder; nothing in this cell fills it

features = bs.find_all("feature")
feature_indicies = np.array([int(feat.find("feature_index").text) for feat in features])
means = np.array([float(feat.find("mean").text) for feat in features]).round(4)
stdds = np.array([float(feat.find("std").text) for feat in features]).round(4)

# Summary table of the thresholds. Names are looked up through the indices read
# from the XML (the same lookup the report loop below uses), so each row is
# labelled with the feature its numbers belong to.
columns = ["Feature Index", "Feature code", "Mean", "Standard deviation", "mean-std", "mean+std"]
data = np.vstack((feature_indicies, feature_names[feature_indicies], means, stdds,
                  (means-stdds).round(4), (means+stdds).round(4)))
df = pd.DataFrame(data.T, columns=columns)

# --- essays and grades: at most the first 1000 writings of this level ---
essay_file = "dataset_binned/level_" + level + ".xml"
with open(essay_file, "r") as f:
    bs = BS(f, "lxml")
    writings = bs.find_all("writing")[:1000]
essays = [writing.find("text").text for writing in writings]
grades = np.array([int(writing.find("grade").text) for writing in writings])

bins = []  # placeholder; nothing in this cell fills it

# --- cached features; the stored grades must line up with the essays just read ---
feature_matrix = np.load("all_features_1000/feature_matrix_"+level+"_fm.npy")
grades_test = np.load("all_features_1000/feature_matrix_"+level+"_grades.npy")
# array_equal returns False on a length mismatch instead of raising or broadcasting.
assert np.array_equal(grades_test, grades), "cached grades do not match " + essay_file

rule = "---------------------------------------------------------------------------"

for index, mean, std in zip(feature_indicies, means, stdds):
    values = feature_matrix.T[index]  # this feature's value for every essay
    n_samples = len(values)
    lower, upper = mean - std, mean + std

    low_mask = values <= lower
    high_mask = values >= upper
    middle_mask = (values > lower) & (values < upper)

    # Four bins: (-inf, lower], (lower, mean], (mean, upper), [upper, +inf)
    bin_masks = (
        low_mask,
        (values > lower) & (values <= mean),
        (values > mean) & (values < upper),
        high_mask,
    )

    print("Feature code: ", feature_names[index])
    for bin_number, mask in enumerate(bin_masks, start=1):
        # Percentage of the essays actually loaded rather than of an assumed 1000.
        share = 100 * np.count_nonzero(mask) / n_samples if n_samples else 0.0
        print("Bin{}: {}%".format(bin_number, share))
    print()

    # First essay falling in each class; -1 marks "no essay in this class".
    negative_feedback_index = np.argmax(low_mask) if low_mask.any() else -1
    neutral_feedback_index = np.argmax(middle_mask) if middle_mask.any() else -1
    positive_feedback_index = np.argmax(high_mask) if high_mask.any() else -1

    # For these three features the high bin is reported as negative feedback and
    # the low bin as positive, so the two samples trade places.
    if feature_names[index] in ("LA_ER", "LA_COL_ERR_R", "CA_BIN1_R"):
        negative_feedback_index, positive_feedback_index = positive_feedback_index, negative_feedback_index

    print("3 sample essays")
    print("Mean: {}, std: {}, Mean-std: {}, mean+std: {}".format(mean, std, mean-std, mean+std))

    samples = (
        ("Positive", positive_feedback_index, "\n"),
        ("Neutral", neutral_feedback_index, "\n"),
        ("Negative", negative_feedback_index, "\n\n"),
    )
    for feedback, sample_index, trailer in samples:
        found = sample_index != -1
        print(rule)
        print("Feedback: {}".format(feedback))
        print("Essay: {}".format(essays[sample_index] if found else "No sample"))
        print("Grade: {}".format(grades[sample_index] if found else "No sample"))
        print("Feature value: {}{}".format(values[sample_index] if found else "No sample", trailer))
    print("===========================================================================")

Feature code:  LV_HDD
Bin1: 16.8%
Bin2: 29.5%
Bin3: 38.3%
Bin4: 15.4%

3 sample essays
Mean: 0.7594, std: 0.0729, Mean-std: 0.6865, mean+std: 0.8322999999999999
---------------------------------------------------------------------------
Feedback: Positive
Essay: 
      To: xxx@gamil.com From: xxx@gamil.com Subject: networking event Hi, Amy. I just returned from a successful networking event with Tom. There were hundred of fashion designers at La Tom sharing contacts and ideas for new look. I think that Roman Scalio was excited about my sping desighs. Maybe he'll call me! I guess that he liked the Balinese fabrics. i'll tell you more on Monday! Winnie
    
Grade: 95
Feature value: 0.8843945171454981

---------------------------------------------------------------------------
Feedback: Neutral
Essay: 
      April 24, 2013 To whom it may concem: I highly recommend Jon Carll to you company. I worked with Mr. Carll for 8 years at Bell Shop. Mr. Carll was sales assistant at Bell Shop. He was

<h1>Level B1 bin percentages</h1>

In [6]:
# Level B1: per-feature bin percentages plus one sample essay per feedback class.
level = "b1"

# --- bin thresholds: mean and standard deviation of every binned feature ---
path_to_bins = "bins/bin_values_level_"+level+".xml"
# Context manager so the handle is closed as soon as the XML has been parsed.
with open(path_to_bins, "r") as bins_file:
    bs = BS(bins_file, "lxml")

binning_criterias = []  # placeholder; nothing in this cell fills it

features = bs.find_all("feature")
feature_indicies = np.array([int(feat.find("feature_index").text) for feat in features])
means = np.array([float(feat.find("mean").text) for feat in features]).round(4)
stdds = np.array([float(feat.find("std").text) for feat in features]).round(4)

# Summary table of the thresholds. Names are looked up through the indices read
# from the XML (the same lookup the report loop below uses), so each row is
# labelled with the feature its numbers belong to.
columns = ["Feature Index", "Feature code", "Mean", "Standard deviation", "mean-std", "mean+std"]
data = np.vstack((feature_indicies, feature_names[feature_indicies], means, stdds,
                  (means-stdds).round(4), (means+stdds).round(4)))
df = pd.DataFrame(data.T, columns=columns)

# --- essays and grades: at most the first 1000 writings of this level ---
essay_file = "dataset_binned/level_" + level + ".xml"
with open(essay_file, "r") as f:
    bs = BS(f, "lxml")
    writings = bs.find_all("writing")[:1000]
essays = [writing.find("text").text for writing in writings]
grades = np.array([int(writing.find("grade").text) for writing in writings])

bins = []  # placeholder; nothing in this cell fills it

# --- cached features; the stored grades must line up with the essays just read ---
feature_matrix = np.load("all_features_1000/feature_matrix_"+level+"_fm.npy")
grades_test = np.load("all_features_1000/feature_matrix_"+level+"_grades.npy")
# array_equal returns False on a length mismatch instead of raising or broadcasting.
assert np.array_equal(grades_test, grades), "cached grades do not match " + essay_file

rule = "---------------------------------------------------------------------------"

for index, mean, std in zip(feature_indicies, means, stdds):
    values = feature_matrix.T[index]  # this feature's value for every essay
    n_samples = len(values)
    lower, upper = mean - std, mean + std

    low_mask = values <= lower
    high_mask = values >= upper
    middle_mask = (values > lower) & (values < upper)

    # Four bins: (-inf, lower], (lower, mean], (mean, upper), [upper, +inf)
    bin_masks = (
        low_mask,
        (values > lower) & (values <= mean),
        (values > mean) & (values < upper),
        high_mask,
    )

    print("Feature code: ", feature_names[index])
    for bin_number, mask in enumerate(bin_masks, start=1):
        # Percentage of the essays actually loaded rather than of an assumed 1000.
        share = 100 * np.count_nonzero(mask) / n_samples if n_samples else 0.0
        print("Bin{}: {}%".format(bin_number, share))
    print()

    # First essay falling in each class; -1 marks "no essay in this class".
    negative_feedback_index = np.argmax(low_mask) if low_mask.any() else -1
    neutral_feedback_index = np.argmax(middle_mask) if middle_mask.any() else -1
    positive_feedback_index = np.argmax(high_mask) if high_mask.any() else -1

    # For these three features the high bin is reported as negative feedback and
    # the low bin as positive, so the two samples trade places.
    if feature_names[index] in ("LA_ER", "LA_COL_ERR_R", "CA_BIN1_R"):
        negative_feedback_index, positive_feedback_index = positive_feedback_index, negative_feedback_index

    print("3 sample essays")
    print("Mean: {}, std: {}, Mean-std: {}, mean+std: {}".format(mean, std, mean-std, mean+std))

    samples = (
        ("Positive", positive_feedback_index, "\n"),
        ("Neutral", neutral_feedback_index, "\n"),
        ("Negative", negative_feedback_index, "\n\n"),
    )
    for feedback, sample_index, trailer in samples:
        found = sample_index != -1
        print(rule)
        print("Feedback: {}".format(feedback))
        print("Essay: {}".format(essays[sample_index] if found else "No sample"))
        print("Grade: {}".format(grades[sample_index] if found else "No sample"))
        print("Feature value: {}{}".format(values[sample_index] if found else "No sample", trailer))
    print("===========================================================================")

Feature code:  LV_HDD
Bin1: 14.3%
Bin2: 34.7%
Bin3: 37.2%
Bin4: 13.8%

3 sample essays
Mean: 0.8024, std: 0.0428, Mean-std: 0.7596, mean+std: 0.8452
---------------------------------------------------------------------------
Feedback: Positive
Essay: 
      I think education have some trouble in China.There are a lot of tests and plenty of homework. I 'm totally against that.  Seldom older people to go back to school. There are ranking in China. People are like comparing each other. Private school is popular in daily China, it's so expansive but seem every parents want the childrens join in it. Every student have to study hard for famous clloeges: such as QingHuang University and BeiJing University.it's said &quot;Thousands of soldiers and horses go through bridge&quot;.Of course .Chinese students are very bright and clever.
    
Grade: 90
Feature value: 0.8539977875338487

---------------------------------------------------------------------------
Feedback: Neutral
Essay: 
      Dear 

<h1>Level B2 bin percentages</h1>

In [7]:
# Level B2: per-feature bin percentages plus one sample essay per feedback class.
level = "b2"

# --- bin thresholds: mean and standard deviation of every binned feature ---
path_to_bins = "bins/bin_values_level_"+level+".xml"
# Context manager so the handle is closed as soon as the XML has been parsed.
with open(path_to_bins, "r") as bins_file:
    bs = BS(bins_file, "lxml")

binning_criterias = []  # placeholder; nothing in this cell fills it

features = bs.find_all("feature")
feature_indicies = np.array([int(feat.find("feature_index").text) for feat in features])
means = np.array([float(feat.find("mean").text) for feat in features]).round(4)
stdds = np.array([float(feat.find("std").text) for feat in features]).round(4)

# Summary table of the thresholds. Names are looked up through the indices read
# from the XML (the same lookup the report loop below uses), so each row is
# labelled with the feature its numbers belong to.
columns = ["Feature Index", "Feature code", "Mean", "Standard deviation", "mean-std", "mean+std"]
data = np.vstack((feature_indicies, feature_names[feature_indicies], means, stdds,
                  (means-stdds).round(4), (means+stdds).round(4)))
df = pd.DataFrame(data.T, columns=columns)

# --- essays and grades: at most the first 1000 writings of this level ---
essay_file = "dataset_binned/level_" + level + ".xml"
with open(essay_file, "r") as f:
    bs = BS(f, "lxml")
    writings = bs.find_all("writing")[:1000]
essays = [writing.find("text").text for writing in writings]
grades = np.array([int(writing.find("grade").text) for writing in writings])

bins = []  # placeholder; nothing in this cell fills it

# --- cached features; the stored grades must line up with the essays just read ---
feature_matrix = np.load("all_features_1000/feature_matrix_"+level+"_fm.npy")
grades_test = np.load("all_features_1000/feature_matrix_"+level+"_grades.npy")
# array_equal returns False on a length mismatch instead of raising or broadcasting.
assert np.array_equal(grades_test, grades), "cached grades do not match " + essay_file

rule = "---------------------------------------------------------------------------"

for index, mean, std in zip(feature_indicies, means, stdds):
    values = feature_matrix.T[index]  # this feature's value for every essay
    n_samples = len(values)
    lower, upper = mean - std, mean + std

    low_mask = values <= lower
    high_mask = values >= upper
    middle_mask = (values > lower) & (values < upper)

    # Four bins: (-inf, lower], (lower, mean], (mean, upper), [upper, +inf)
    bin_masks = (
        low_mask,
        (values > lower) & (values <= mean),
        (values > mean) & (values < upper),
        high_mask,
    )

    print("Feature code: ", feature_names[index])
    for bin_number, mask in enumerate(bin_masks, start=1):
        # Percentage of the essays actually loaded rather than of an assumed 1000.
        share = 100 * np.count_nonzero(mask) / n_samples if n_samples else 0.0
        print("Bin{}: {}%".format(bin_number, share))
    print()

    # First essay falling in each class; -1 marks "no essay in this class".
    negative_feedback_index = np.argmax(low_mask) if low_mask.any() else -1
    neutral_feedback_index = np.argmax(middle_mask) if middle_mask.any() else -1
    positive_feedback_index = np.argmax(high_mask) if high_mask.any() else -1

    # For these three features the high bin is reported as negative feedback and
    # the low bin as positive, so the two samples trade places.
    if feature_names[index] in ("LA_ER", "LA_COL_ERR_R", "CA_BIN1_R"):
        negative_feedback_index, positive_feedback_index = positive_feedback_index, negative_feedback_index

    print("3 sample essays")
    print("Mean: {}, std: {}, Mean-std: {}, mean+std: {}".format(mean, std, mean-std, mean+std))

    samples = (
        ("Positive", positive_feedback_index, "\n"),
        ("Neutral", neutral_feedback_index, "\n"),
        ("Negative", negative_feedback_index, "\n\n"),
    )
    for feedback, sample_index, trailer in samples:
        found = sample_index != -1
        print(rule)
        print("Feedback: {}".format(feedback))
        print("Essay: {}".format(essays[sample_index] if found else "No sample"))
        print("Grade: {}".format(grades[sample_index] if found else "No sample"))
        print("Feature value: {}{}".format(values[sample_index] if found else "No sample", trailer))
    print("===========================================================================")

Feature code:  LV_HDD
Bin1: 11.6%
Bin2: 33.8%
Bin3: 43.1%
Bin4: 11.5%

3 sample essays
Mean: 0.807, std: 0.043, Mean-std: 0.764, mean+std: 0.8500000000000001
---------------------------------------------------------------------------
Feedback: Positive
Essay: 
      In my own opinion a successful leader becames with hard work, humility, honesty and dedication, but what is leadership?, I think it's having a most important responsability at any moment, in school, at work or in any kind of team. A good leader teaches, helps, listens, and understand what the problem or situation is about. A good leader never complains, never desistes, take care others. a good leader persuades, motivates, shows the way to resolve. When a leader stop listens its collegue probably it's time to renovate. anyone can be an excellent leader, but actually anybody does not want to, because of our culture, or ourway to think. I diarly encourage my co-workers to do their best every day, I diarly motivate myself.
    

<h1>Level C1 bin percentages</h1>

In [8]:
# Level C1: per-feature bin percentages plus one sample essay per feedback class.
level = "c1"

# --- bin thresholds: mean and standard deviation of every binned feature ---
path_to_bins = "bins/bin_values_level_"+level+".xml"
# Context manager so the handle is closed as soon as the XML has been parsed.
with open(path_to_bins, "r") as bins_file:
    bs = BS(bins_file, "lxml")

binning_criterias = []  # placeholder; nothing in this cell fills it

features = bs.find_all("feature")
feature_indicies = np.array([int(feat.find("feature_index").text) for feat in features])
means = np.array([float(feat.find("mean").text) for feat in features]).round(4)
stdds = np.array([float(feat.find("std").text) for feat in features]).round(4)

# Summary table of the thresholds. Names are looked up through the indices read
# from the XML (the same lookup the report loop below uses), so each row is
# labelled with the feature its numbers belong to.
columns = ["Feature Index", "Feature code", "Mean", "Standard deviation", "mean-std", "mean+std"]
data = np.vstack((feature_indicies, feature_names[feature_indicies], means, stdds,
                  (means-stdds).round(4), (means+stdds).round(4)))
df = pd.DataFrame(data.T, columns=columns)

# --- essays and grades: at most the first 1000 writings of this level ---
essay_file = "dataset_binned/level_" + level + ".xml"
with open(essay_file, "r") as f:
    bs = BS(f, "lxml")
    writings = bs.find_all("writing")[:1000]
essays = [writing.find("text").text for writing in writings]
grades = np.array([int(writing.find("grade").text) for writing in writings])

bins = []  # placeholder; nothing in this cell fills it

# --- cached features; the stored grades must line up with the essays just read ---
feature_matrix = np.load("all_features_1000/feature_matrix_"+level+"_fm.npy")
grades_test = np.load("all_features_1000/feature_matrix_"+level+"_grades.npy")
# array_equal returns False on a length mismatch instead of raising or broadcasting.
assert np.array_equal(grades_test, grades), "cached grades do not match " + essay_file

rule = "---------------------------------------------------------------------------"

for index, mean, std in zip(feature_indicies, means, stdds):
    values = feature_matrix.T[index]  # this feature's value for every essay
    n_samples = len(values)
    lower, upper = mean - std, mean + std

    low_mask = values <= lower
    high_mask = values >= upper
    middle_mask = (values > lower) & (values < upper)

    # Four bins: (-inf, lower], (lower, mean], (mean, upper), [upper, +inf)
    bin_masks = (
        low_mask,
        (values > lower) & (values <= mean),
        (values > mean) & (values < upper),
        high_mask,
    )

    print("Feature code: ", feature_names[index])
    for bin_number, mask in enumerate(bin_masks, start=1):
        # Percentage of the essays actually loaded rather than of an assumed 1000.
        share = 100 * np.count_nonzero(mask) / n_samples if n_samples else 0.0
        print("Bin{}: {}%".format(bin_number, share))
    print()

    # First essay falling in each class; -1 marks "no essay in this class".
    negative_feedback_index = np.argmax(low_mask) if low_mask.any() else -1
    neutral_feedback_index = np.argmax(middle_mask) if middle_mask.any() else -1
    positive_feedback_index = np.argmax(high_mask) if high_mask.any() else -1

    # For these three features the high bin is reported as negative feedback and
    # the low bin as positive, so the two samples trade places.
    if feature_names[index] in ("LA_ER", "LA_COL_ERR_R", "CA_BIN1_R"):
        negative_feedback_index, positive_feedback_index = positive_feedback_index, negative_feedback_index

    print("3 sample essays")
    print("Mean: {}, std: {}, Mean-std: {}, mean+std: {}".format(mean, std, mean-std, mean+std))

    samples = (
        ("Positive", positive_feedback_index, "\n"),
        ("Neutral", neutral_feedback_index, "\n"),
        ("Negative", negative_feedback_index, "\n\n"),
    )
    for feedback, sample_index, trailer in samples:
        found = sample_index != -1
        print(rule)
        print("Feedback: {}".format(feedback))
        print("Essay: {}".format(essays[sample_index] if found else "No sample"))
        print("Grade: {}".format(grades[sample_index] if found else "No sample"))
        print("Feature value: {}{}".format(values[sample_index] if found else "No sample", trailer))
    print("===========================================================================")

Feature code:  LV_HDD
Bin1: 15.3%
Bin2: 32.4%
Bin3: 37.5%
Bin4: 14.8%

3 sample essays
Mean: 0.8182, std: 0.0333, Mean-std: 0.7849, mean+std: 0.8515
---------------------------------------------------------------------------
Feedback: Positive
Essay: 
      I think that every company should build a CSR program. It's not only my opinion, because in Italy even our constitutional law, at his article 41, says that &quot;...the economic activity, both public and private, will be addressed and coordinated towards social purposes&quot;. Other than the law, many companies build projects that underline their giving back to the society. Surely, this will guarantee them some news coverage, but these actions will provide good outcomes. For instance, many companies are sponsoring small sport teams, even in minor sports that otherwise wouldn't have the money to make their activities. Or, other companies devolve some hours of their workers to help the community to maintain public goods like parks, ga