In [151]:
import plotly
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

plotly.offline.init_notebook_mode(connected=True)

In [203]:
data = pd.read_csv("wiki4HE.csv", sep=";") 


In [204]:
# Limpeza dos dados 

## Substituição de células com conteúdo "?"
data = data.replace('?', np.nan)


data.PhD = data.PhD.astype("bool")
data.ENJ1 = pd.to_numeric(data.ENJ1).copy()
data.ENJ2 = pd.to_numeric(data.ENJ2).copy()

In [205]:
data.describe()

Unnamed: 0,AGE,GENDER,UNIVERSITY,ENJ1,ENJ2
count,913.0,913.0,913.0,906.0,896.0
mean,42.24644,0.424973,1.123768,3.794702,3.821429
std,8.058418,0.49461,0.329497,0.969031,0.88654
min,23.0,0.0,1.0,1.0,1.0
25%,36.0,0.0,1.0,3.0,3.0
50%,42.0,0.0,1.0,4.0,4.0
75%,47.0,1.0,1.0,4.0,4.0
max,69.0,1.0,2.0,5.0,5.0


In [206]:
data.shape

(913, 53)

In [207]:
for column in data.columns[10:]:
    data[column] = pd.to_numeric(data[column])
data.dtypes
pd.DataFrame(data.dtypes, columns=['Type'])

Unnamed: 0,Type
AGE,int64
GENDER,int64
DOMAIN,object
PhD,bool
YEARSEXP,object
UNIVERSITY,int64
UOC_POSITION,object
OTHER_POSITION,object
OTHERSTATUS,object
USERWIKI,object


In [208]:
data

Unnamed: 0,AGE,GENDER,DOMAIN,PhD,YEARSEXP,UNIVERSITY,UOC_POSITION,OTHER_POSITION,OTHERSTATUS,USERWIKI,...,BI2,Inc1,Inc2,Inc3,Inc4,Exp1,Exp2,Exp3,Exp4,Exp5
0,40,0,2,True,14,1,2,,,0,...,3.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,1.0,2.0
1,42,0,5,True,18,1,2,,,0,...,2.0,4.0,4.0,3.0,4.0,2.0,2.0,4.0,2.0,4.0
2,37,0,4,True,13,1,3,,,0,...,1.0,5.0,3.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0
3,40,0,4,False,13,1,3,,,0,...,3.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0
4,51,0,6,False,8,1,3,,,1,...,5.0,5.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,43,0,5,True,21,2,,,2,0,...,2.0,2.0,2.0,2.0,2.0,,,,,
909,53,0,6,False,25,2,,,6,0,...,4.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,1.0,1.0
910,39,0,5,True,9,2,,,4,0,...,2.0,5.0,4.0,3.0,,5.0,5.0,5.0,4.0,1.0
911,40,0,3,True,10,2,,,2,0,...,5.0,1.0,5.0,2.0,2.0,4.0,4.0,2.0,1.0,1.0


In [143]:
fig = px.histogram(data, x="YEARSEXP")
fig.show()

In [142]:
fig = px.histogram(data, x="AGE")
fig.show()

In [92]:
males = (data.GENDER == 0).sum()
females = (data.GENDER == 1).sum()

labels = ['Males','Females']
values = [males, females]
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

In [93]:
#1=Arts & Humanities; 2=Sciences; 3=Health Sciences; 4=Engineering & Architecture; 5=Law & Politics

Arts = (data.DOMAIN == "1").sum()
Sciences = (data.DOMAIN == "2").sum()
Health = (data.DOMAIN == "3").sum()
Engineering = (data.DOMAIN == "4").sum()
Law = (data.DOMAIN == "5").sum()

labels = ['Arts & Humanities','Sciences','Health Sciences','Engineering & Architecture','Law & Politics']
values = [Arts, Sciences, Health, Engineering, Law]
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()


In [73]:
print("ENJ1 AGE ANALYSIS")

meanPositive_ENJ1 = data[(data.ENJ1 > 3)].AGE.mean()
meanNeutral_ENJ1 = data[(data.ENJ1 == 3)].AGE.mean()
meanNegative_ENJ1 = data[(data.ENJ1 < 3)].AGE.mean()

medianPositive_ENJ1 = data[(data.ENJ1 > 3)].AGE.median()
medianNeutral_ENJ1 = data[(data.ENJ1 == 3)].AGE.median()
medianNegative_ENJ1 = data[(data.ENJ1 < 3)].AGE.median()

modePositive_ENJ1 = data[(data.ENJ1 > 3)].AGE.mode()[0]
modeNeutral_ENJ1 = data[(data.ENJ1 == 3)].AGE.mode()[0]
modeNegative_ENJ1 = data[(data.ENJ1 < 3)].AGE.mode()[0]

table = pd.DataFrame(np.array([[meanPositive_ENJ1, medianPositive_ENJ1, modePositive_ENJ1], [meanNeutral_ENJ1, medianNeutral_ENJ1, modeNeutral_ENJ1], [meanNegative_ENJ1, medianNegative_ENJ1, modeNegative_ENJ1]]),
                     ['Positive', 'Neutral', 'Negative'], ['Mean', 'Median', 'Mode'])
table

ENJ1 AGE ANALYSIS


Unnamed: 0,Mean,Median,Mode
Positive,41.585526,41.0,42.0
Neutral,44.072464,43.0,50.0
Negative,42.131868,43.0,43.0


In [74]:
print("ENJ2 AGE ANALYSIS")

meanPositive_ENJ2 = data[(data.ENJ2 > 3)].AGE.mean()
meanNeutral_ENJ2 = data[(data.ENJ2 == 3)].AGE.mean()
meanNegative_ENJ2 = data[(data.ENJ2 < 3)].AGE.mean()

medianPositive_ENJ2 = data[(data.ENJ2 > 3)].AGE.median()
medianNeutral_ENJ2 = data[(data.ENJ2 == 3)].AGE.median()
medianNegative_ENJ2 = data[(data.ENJ2 < 3)].AGE.median()

modePositive_ENJ2 = data[(data.ENJ2 > 3)].AGE.mode()[0]
modeNeutral_ENJ2 = data[(data.ENJ2 == 3)].AGE.mode()[0]
modeNegative_ENJ2 = data[(data.ENJ2 < 3)].AGE.mode()[0]

table = pd.DataFrame(np.array([[meanPositive_ENJ2, medianPositive_ENJ2, modePositive_ENJ2], [meanNeutral_ENJ2, medianNeutral_ENJ2, modeNeutral_ENJ2], [meanNegative_ENJ2, medianNegative_ENJ2, modeNegative_ENJ2]]),
                     ['Positive', 'Neutral', 'Negative'], ['Mean', 'Median', 'Mode'])
table

ENJ2 AGE ANALYSIS


Unnamed: 0,Mean,Median,Mode
Positive,41.327243,41.0,38.0
Neutral,44.053571,43.0,39.0
Negative,43.628571,44.0,43.0


In [144]:
positive_ENJ1 = (data[((data.ENJ1 > 3) & (data.USERWIKI == "1"))]).shape[0]
negative_ENJ1 = (data[((data.ENJ1 < 3) & (data.USERWIKI == "1"))]).shape[0]
neutral_ENJ1 = (data[((data.ENJ1 == 3) & (data.USERWIKI == "1"))]).shape[0]




table = pd.DataFrame(np.array([positive_ENJ1,neutral_ENJ1, negative_ENJ1]),
                     ['Positive', 'Neutral', 'Negative',], ['count'])



fig = px.pie(table, values='count', names= ['Positive', 'Neutral', 'Negative'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table



Unnamed: 0,count
Positive,90
Neutral,26
Negative,7


In [147]:
positive_ENJ2 = (data[((data.ENJ2 > 3) & (data.USERWIKI == "1"))]).shape[0]
negative_ENJ2 = (data[((data.ENJ2 < 3) & (data.USERWIKI == "1"))]).shape[0]
neutral_ENJ2 = (data[((data.ENJ2 == 3) & (data.USERWIKI == "1"))]).shape[0]




table = pd.DataFrame(np.array([positive_ENJ2,neutral_ENJ2, negative_ENJ2]),
                     ['Positive', 'Neutral', 'Negative',], ['count'])



fig = px.pie(table, values='count', names= ['Positive', 'Neutral', 'Negative'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table


Unnamed: 0,count
Positive,88
Neutral,27
Negative,6


In [145]:
positive_ENJ1 = (data[((data.ENJ1 > 3) & (data.USERWIKI == "0"))]).shape[0]
negative_ENJ1 = (data[((data.ENJ1 < 3) & (data.USERWIKI == "0"))]).shape[0]

neutral_ENJ1 = (data[((data.ENJ1 == 3) & (data.USERWIKI == "0"))]).shape[0]



table = pd.DataFrame(np.array([positive_ENJ1, neutral_ENJ1, negative_ENJ1]),
                     ['Positive', 'Neutral', 'Negative'], ['count'])



fig = px.pie(table, values='count', names= ['Positive', 'Neutral', 'Negative'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table


Unnamed: 0,count
Positive,515
Neutral,180
Negative,84


In [148]:
positive_ENJ2 = (data[((data.ENJ2 > 3) & (data.USERWIKI == "0"))]).shape[0]
negative_ENJ2 = (data[((data.ENJ2 < 3) & (data.USERWIKI == "0"))]).shape[0]

neutral_ENJ2 = (data[((data.ENJ2 == 3) & (data.USERWIKI == "0"))]).shape[0]



table = pd.DataFrame(np.array([positive_ENJ2, neutral_ENJ2, negative_ENJ2]),
                     ['Positive', 'Neutral', 'Negative'], ['count'])



fig = px.pie(table, values='count', names= ['Positive', 'Neutral', 'Negative'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

Unnamed: 0,count
Positive,512
Neutral,195
Negative,64


In [135]:
positive_ENJ1_notuser = (data[((data.ENJ1 > 3) & (data.USERWIKI == "0"))]).shape[0]
positive_ENJ1_user = (data[((data.ENJ1 > 3) & (data.USERWIKI == "1"))]).shape[0]





table = pd.DataFrame(np.array([positive_ENJ1_user, positive_ENJ1_notuser]),
                     ['User', 'Not User'], ['count'])



fig = px.pie(table, values='count', names= ['User', 'Not User'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

Unnamed: 0,count
User,90
Not User,515


In [149]:
positive_ENJ2_notuser = (data[((data.ENJ2 > 3) & (data.USERWIKI == "0"))]).shape[0]
positive_ENJ2_user = (data[((data.ENJ2 > 3) & (data.USERWIKI == "1"))]).shape[0]





table = pd.DataFrame(np.array([positive_ENJ2_user, positive_ENJ2_notuser]),
                     ['User', 'Not User'], ['count'])



fig = px.pie(table, values='count', names= ['User', 'Not User'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

Unnamed: 0,count
User,88
Not User,512


In [146]:
negative_ENJ1_notuser = (data[((data.ENJ1 < 3) & (data.USERWIKI == "0"))]).shape[0]
negative_ENJ1_user = (data[((data.ENJ1 < 3) & (data.USERWIKI == "1"))]).shape[0]


table = pd.DataFrame(np.array([negative_ENJ1_user, negative_ENJ1_notuser]),
                     ['User', 'Not User'], ['count'])



fig = px.pie(table, values='count', names= ['User', 'Not User'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

Unnamed: 0,count
User,7
Not User,84


In [150]:
negative_ENJ2_notuser = (data[((data.ENJ2 < 3) & (data.USERWIKI == "0"))]).shape[0]
negative_ENJ2_user = (data[((data.ENJ2 < 3) & (data.USERWIKI == "1"))]).shape[0]


table = pd.DataFrame(np.array([negative_ENJ2_user, negative_ENJ2_notuser]),
                     ['User', 'Not User'], ['count'])



fig = px.pie(table, values='count', names= ['User', 'Not User'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

Unnamed: 0,count
User,6
Not User,64


In [130]:
positive_ENJ1_isPhD = (data[((data.ENJ1 > 3) & (data.PhD == True) & (data.USERWIKI == "0"))]).shape[0]
positive_ENJ1_user = (data[((data.ENJ1 > 3) & (data.PhD == False) & (data.USERWIKI == "1"))]).shape[0]
positive_ENJ1_userPhD = (data[((data.ENJ1 > 3) & (data.PhD == True) & (data.USERWIKI == "1"))]).shape[0]


print(positive_ENJ1_isPhD)
print(positive_ENJ1_user)
print(positive_ENJ1_userPhD)


table = pd.DataFrame(np.array([positive_ENJ1_isPhD, positive_ENJ1_user, positive_ENJ1_userPhD]),
                     ['PhD', 'UserWiki', 'PhD and UserWiki'], ['count'])



fig = px.pie(table, values='count', names= ['PhD', 'UserWiki', 'PhD and UserWiki'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

232
50
40


Unnamed: 0,count
PhD,232
UserWiki,50
PhD and UserWiki,40


In [131]:
ENJ1_isPhD = (data[((data.ENJ1 < 3) & (data.PhD == True) & (data.USERWIKI == "0"))]).shape[0]
ENJ1_user = (data[((data.ENJ1 < 3) & (data.PhD == False) & (data.USERWIKI == "1"))]).shape[0]
ENJ1_userPhD = (data[((data.ENJ1 < 3) & (data.PhD == True) & (data.USERWIKI == "1"))]).shape[0]


print(positive_ENJ1_isPhD)
print(positive_ENJ1_user)
print(positive_ENJ1_userPhD)

table = pd.DataFrame(np.array([ENJ1_isPhD, ENJ1_user, ENJ1_userPhD]),
                     ['PhD', 'UserWiki', 'PhD and UserWiki'], ['count'])



fig = px.pie(table, values='count', names= ['PhD', 'UserWiki', 'PhD and UserWiki'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

232
50
40


Unnamed: 0,count
PhD,44
UserWiki,5
PhD and UserWiki,2


In [132]:
positive_ENJ2_isPhD = (data[((data.ENJ2 > 3) & (data.PhD == True) & (data.USERWIKI == "0"))]).shape[0]
positive_ENJ2_user = (data[((data.ENJ2 > 3) & (data.PhD == False) & (data.USERWIKI == "1"))]).shape[0]
positive_ENJ2_userPhD = (data[((data.ENJ2 > 3) & (data.PhD == True) & (data.USERWIKI == "1"))]).shape[0]


print(positive_ENJ2_isPhD)
print(positive_ENJ2_user)
print(positive_ENJ2_userPhD)


table = pd.DataFrame(np.array([positive_ENJ2_isPhD, positive_ENJ2_user, positive_ENJ2_userPhD]),
                     ['PhD', 'UserWiki', 'PhD and UserWiki'], ['count'])



fig = px.pie(table, values='count', names= ['PhD', 'UserWiki', 'PhD and UserWiki'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

237
48
40


Unnamed: 0,count
PhD,237
UserWiki,48
PhD and UserWiki,40


In [133]:
ENJ2_isPhD = (data[((data.ENJ2 < 3) & (data.PhD == True) & (data.USERWIKI == "0"))]).shape[0]
ENJ2_user = (data[((data.ENJ2 < 3) & (data.PhD == False) & (data.USERWIKI == "1"))]).shape[0]
ENJ2_userPhD = (data[((data.ENJ2 < 3) & (data.PhD == True) & (data.USERWIKI == "1"))]).shape[0]


print(positive_ENJ2_isPhD)
print(positive_ENJ2_user)
print(positive_ENJ2_userPhD)

table = pd.DataFrame(np.array([ENJ2_isPhD, ENJ2_user, ENJ2_userPhD]),
                     ['PhD', 'UserWiki', 'PhD and UserWiki'], ['count'])



fig = px.pie(table, values='count', names= ['PhD', 'UserWiki', 'PhD and UserWiki'], 
             color_discrete_sequence=px.colors.sequential.RdBu, hole=.3, 
             title='User Distribution')
fig.show()
table

237
48
40


Unnamed: 0,count
PhD,34
UserWiki,4
PhD and UserWiki,2


In [154]:
fig = px.histogram(data, x="ENJ1")
fig.show()

In [155]:
fig = px.histogram(data, x="ENJ2")
fig.show()

In [185]:
fig = px.bar(data, x='Use1', y='UNIVERSITY')
fig.show()

In [209]:

data['score'] = pd.Series([0]*data.shape[0])

In [210]:
for row in data.index:
    groupValues = []
    for column in data.columns[10:]:
        myvalue = data.loc[row, column]
        if not np.isnan(myvalue):
            groupValues.append(data.loc[row, column])
    data.loc[row, 'score'] = np.mean(groupValues)

In [211]:
data

Unnamed: 0,AGE,GENDER,DOMAIN,PhD,YEARSEXP,UNIVERSITY,UOC_POSITION,OTHER_POSITION,OTHERSTATUS,USERWIKI,...,Inc1,Inc2,Inc3,Inc4,Exp1,Exp2,Exp3,Exp4,Exp5,score
0,40,0,2,True,14,1,2,,,0,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,1.0,2.0,3.022727
1,42,0,5,True,18,1,2,,,0,...,4.0,4.0,3.0,4.0,2.0,2.0,4.0,2.0,4.0,2.636364
2,37,0,4,True,13,1,3,,,0,...,5.0,3.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,2.590909
3,40,0,4,False,13,1,3,,,0,...,3.0,4.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,3.363636
4,51,0,6,False,8,1,3,,,1,...,5.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0,4.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,43,0,5,True,21,2,,,2,0,...,2.0,2.0,2.0,2.0,,,,,,2.675676
909,53,0,6,False,25,2,,,6,0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,1.0,1.0,3.214286
910,39,0,5,True,9,2,,,4,0,...,5.0,4.0,3.0,,5.0,5.0,5.0,4.0,1.0,3.047619
911,40,0,3,True,10,2,,,2,0,...,1.0,5.0,2.0,2.0,4.0,4.0,2.0,1.0,1.0,3.372093


In [224]:
males = (data[data["score"] >=  data["score"].mean()].GENDER == 0).sum()
females = (data[data["score"] >=  data["score"].mean()].GENDER == 1).sum()
labels = ['Males','Females']
values = [males, females]
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

In [225]:
males = (data[data["score"] >=  data["score"].mean()].PhD == 0).sum()
females = (data[data["score"] >=  data["score"].mean()].PhD == 1).sum()
labels = ['No PhD','PhD']
values = [males, females]
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

In [228]:
fig = px.scatter(data, x='AGE', y='YEARSEXP')
fig.show()